bcache: A block layer cache
Does writethrough and writeback caching, handles unclean shutdown, and has a bunch of other nifty features motivated by real world usage. See the wiki at http://bcache.evilpiepirate.org for more. Signed-off-by: Kent Overstreet <koverstreet@google.com>
This commit is contained in:
		
					parent
					
						
							
								ea6749c705
							
						
					
				
			
			
				commit
				
					
						cafe563591
					
				
			
		
					 36 changed files with 16474 additions and 0 deletions
				
			
		
							
								
								
									
										156
									
								
								Documentation/ABI/testing/sysfs-block-bcache
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										156
									
								
								Documentation/ABI/testing/sysfs-block-bcache
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,156 @@ | |||
| What:		/sys/block/<disk>/bcache/unregister | ||||
| Date:		November 2010 | ||||
| Contact:	Kent Overstreet <kent.overstreet@gmail.com> | ||||
| Description: | ||||
| 		A write to this file causes the backing device or cache to be | ||||
| 		unregistered. If a backing device had dirty data in the cache, | ||||
| 		writeback mode is automatically disabled and all dirty data is | ||||
| 		flushed before the device is unregistered. Caches unregister | ||||
| 		all associated backing devices before unregistering themselves. | ||||
| 
 | ||||
| What:		/sys/block/<disk>/bcache/clear_stats | ||||
| Date:		November 2010 | ||||
| Contact:	Kent Overstreet <kent.overstreet@gmail.com> | ||||
| Description: | ||||
| 		Writing to this file resets all the statistics for the device. | ||||
| 
 | ||||
| What:		/sys/block/<disk>/bcache/cache | ||||
| Date:		November 2010 | ||||
| Contact:	Kent Overstreet <kent.overstreet@gmail.com> | ||||
| Description: | ||||
| 		For a backing device that has cache, a symlink to | ||||
| 		the bcache/ dir of that cache. | ||||
| 
 | ||||
| What:		/sys/block/<disk>/bcache/cache_hits | ||||
| Date:		November 2010 | ||||
| Contact:	Kent Overstreet <kent.overstreet@gmail.com> | ||||
| Description: | ||||
| 		For backing devices: integer number of full cache hits, | ||||
| 		counted per bio. A partial cache hit counts as a miss. | ||||
| 
 | ||||
| What:		/sys/block/<disk>/bcache/cache_misses | ||||
| Date:		November 2010 | ||||
| Contact:	Kent Overstreet <kent.overstreet@gmail.com> | ||||
| Description: | ||||
| 		For backing devices: integer number of cache misses. | ||||
| 
 | ||||
| What:		/sys/block/<disk>/bcache/cache_hit_ratio | ||||
| Date:		November 2010 | ||||
| Contact:	Kent Overstreet <kent.overstreet@gmail.com> | ||||
| Description: | ||||
| 		For backing devices: cache hits as a percentage. | ||||
| 
 | ||||
| What:		/sys/block/<disk>/bcache/sequential_cutoff | ||||
| Date:		November 2010 | ||||
| Contact:	Kent Overstreet <kent.overstreet@gmail.com> | ||||
| Description: | ||||
| 		For backing devices: Threshold past which sequential IO will | ||||
| 		skip the cache. Read and written as bytes in human readable | ||||
| 		units (e.g. echo 10M > sequential_cutoff). | ||||
| 
 | ||||
| What:		/sys/block/<disk>/bcache/bypassed | ||||
| Date:		November 2010 | ||||
| Contact:	Kent Overstreet <kent.overstreet@gmail.com> | ||||
| Description: | ||||
| 		Sum of all reads and writes that have bypassed the cache (due | ||||
| 		to the sequential cutoff).  Expressed as bytes in human | ||||
| 		readable units. | ||||
| 
 | ||||
| What:		/sys/block/<disk>/bcache/writeback | ||||
| Date:		November 2010 | ||||
| Contact:	Kent Overstreet <kent.overstreet@gmail.com> | ||||
| Description: | ||||
| 		For backing devices: When on, writeback caching is enabled and | ||||
| 		writes will be buffered in the cache. When off, caching is in | ||||
| 		writethrough mode; reads and writes will be added to the | ||||
| 		cache but no write buffering will take place. | ||||
| 
 | ||||
| What:		/sys/block/<disk>/bcache/writeback_running | ||||
| Date:		November 2010 | ||||
| Contact:	Kent Overstreet <kent.overstreet@gmail.com> | ||||
| Description: | ||||
| 		For backing devices: when off, dirty data will not be written | ||||
| 		from the cache to the backing device. The cache will still be | ||||
| 		used to buffer writes until it is mostly full, at which point | ||||
| 		writes transparently revert to writethrough mode. Intended only | ||||
| 		for benchmarking/testing. | ||||
| 
 | ||||
| What:		/sys/block/<disk>/bcache/writeback_delay | ||||
| Date:		November 2010 | ||||
| Contact:	Kent Overstreet <kent.overstreet@gmail.com> | ||||
| Description: | ||||
| 		For backing devices: In writeback mode, when dirty data is | ||||
| 		written to the cache and the cache held no dirty data for that | ||||
| 		backing device, writeback from cache to backing device starts | ||||
| 		after this delay, expressed as an integer number of seconds. | ||||
| 
 | ||||
| What:		/sys/block/<disk>/bcache/writeback_percent | ||||
| Date:		November 2010 | ||||
| Contact:	Kent Overstreet <kent.overstreet@gmail.com> | ||||
| Description: | ||||
| 		For backing devices: If nonzero, writeback from cache to | ||||
| 		backing device only takes place when more than this percentage | ||||
| 		of the cache is used, allowing more write coalescing to take | ||||
| 		place and reducing total number of writes sent to the backing | ||||
| 		device. Integer between 0 and 40. | ||||
| 
 | ||||
| What:		/sys/block/<disk>/bcache/synchronous | ||||
| Date:		November 2010 | ||||
| Contact:	Kent Overstreet <kent.overstreet@gmail.com> | ||||
| Description: | ||||
| 		For a cache, a boolean that allows synchronous mode to be | ||||
| 		switched on and off. In synchronous mode all writes are ordered | ||||
| 		such that the cache can reliably recover from unclean shutdown; | ||||
| 		if disabled bcache will not generally wait for writes to | ||||
| 		complete but if the cache is not shut down cleanly all data | ||||
| 		will be discarded from the cache. Should not be turned off with | ||||
| 		writeback caching enabled. | ||||
| 
 | ||||
| What:		/sys/block/<disk>/bcache/discard | ||||
| Date:		November 2010 | ||||
| Contact:	Kent Overstreet <kent.overstreet@gmail.com> | ||||
| Description: | ||||
| 		For a cache, a boolean allowing discard/TRIM to be turned off | ||||
| 		or back on if the device supports it. | ||||
| 
 | ||||
| What:		/sys/block/<disk>/bcache/bucket_size | ||||
| Date:		November 2010 | ||||
| Contact:	Kent Overstreet <kent.overstreet@gmail.com> | ||||
| Description: | ||||
| 		For a cache, bucket size in human readable units, as set at | ||||
| 		cache creation time; should match the erase block size of the | ||||
| 		SSD for optimal performance. | ||||
| 
 | ||||
| What:		/sys/block/<disk>/bcache/nbuckets | ||||
| Date:		November 2010 | ||||
| Contact:	Kent Overstreet <kent.overstreet@gmail.com> | ||||
| Description: | ||||
| 		For a cache, the number of usable buckets. | ||||
| 
 | ||||
| What:		/sys/block/<disk>/bcache/tree_depth | ||||
| Date:		November 2010 | ||||
| Contact:	Kent Overstreet <kent.overstreet@gmail.com> | ||||
| Description: | ||||
| 		For a cache, height of the btree excluding leaf nodes (i.e. a | ||||
| 		one node tree will have a depth of 0). | ||||
| 
 | ||||
| What:		/sys/block/<disk>/bcache/btree_cache_size | ||||
| Date:		November 2010 | ||||
| Contact:	Kent Overstreet <kent.overstreet@gmail.com> | ||||
| Description: | ||||
| 		Number of btree buckets/nodes that are currently cached in | ||||
| 		memory; cache dynamically grows and shrinks in response to | ||||
| 		memory pressure from the rest of the system. | ||||
| 
 | ||||
| What:		/sys/block/<disk>/bcache/written | ||||
| Date:		November 2010 | ||||
| Contact:	Kent Overstreet <kent.overstreet@gmail.com> | ||||
| Description: | ||||
| 		For a cache, total amount of data in human readable units | ||||
| 		written to the cache, excluding all metadata. | ||||
| 
 | ||||
| What:		/sys/block/<disk>/bcache/btree_written | ||||
| Date:		November 2010 | ||||
| Contact:	Kent Overstreet <kent.overstreet@gmail.com> | ||||
| Description: | ||||
| 		For a cache, sum of all btree writes in human readable units. | ||||
							
								
								
									
										343
									
								
								Documentation/bcache.txt
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										343
									
								
								Documentation/bcache.txt
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,343 @@ | |||
| Say you've got a big slow raid 6, and an X-25E or three. Wouldn't it be | ||||
| nice if you could use them as cache... Hence bcache. | ||||
| 
 | ||||
| Wiki and git repositories are at: | ||||
|   http://bcache.evilpiepirate.org | ||||
|   http://evilpiepirate.org/git/linux-bcache.git | ||||
|   http://evilpiepirate.org/git/bcache-tools.git | ||||
| 
 | ||||
| It's designed around the performance characteristics of SSDs - it only allocates | ||||
| in erase block sized buckets, and it uses a hybrid btree/log to track cached | ||||
| extents (which can be anywhere from a single sector to the bucket size). It's | ||||
| designed to avoid random writes at all costs; it fills up an erase block | ||||
| sequentially, then issues a discard before reusing it. | ||||
| 
 | ||||
| Both writethrough and writeback caching are supported. Writeback defaults to | ||||
| off, but can be switched on and off arbitrarily at runtime. Bcache goes to | ||||
| great lengths to protect your data - it reliably handles unclean shutdown. (It | ||||
| doesn't even have a notion of a clean shutdown; bcache simply doesn't return | ||||
| writes as completed until they're on stable storage). | ||||
| 
 | ||||
| Writeback caching can use most of the cache for buffering writes - writing | ||||
| dirty data to the backing device is always done sequentially, scanning from the | ||||
| start to the end of the index. | ||||
| 
 | ||||
| Since random IO is what SSDs excel at, there generally won't be much benefit | ||||
| to caching large sequential IO. Bcache detects sequential IO and skips it; | ||||
| it also keeps a rolling average of the IO sizes per task, and as long as the | ||||
| average is above the cutoff it will skip all IO from that task - instead of | ||||
| caching the first 512k after every seek. Backups and large file copies should | ||||
| thus entirely bypass the cache. | ||||
| 
 | ||||
| In the event of a data IO error on the flash it will try to recover by reading | ||||
| from disk or invalidating cache entries.  For unrecoverable errors (meta data | ||||
| or dirty data), caching is automatically disabled; if dirty data was present | ||||
| in the cache it first disables writeback caching and waits for all dirty data | ||||
| to be flushed. | ||||
| 
 | ||||
| Getting started: | ||||
| You'll need make-bcache from the bcache-tools repository. Both the cache device | ||||
| and backing device must be formatted before use. | ||||
|   make-bcache -B /dev/sdb | ||||
|   make-bcache -C /dev/sdc | ||||
| 
 | ||||
| make-bcache has the ability to format multiple devices at the same time - if | ||||
| you format your backing devices and cache device at the same time, you won't | ||||
| have to manually attach: | ||||
|   make-bcache -B /dev/sda /dev/sdb -C /dev/sdc | ||||
| 
 | ||||
| To make bcache devices known to the kernel, echo them to /sys/fs/bcache/register: | ||||
| 
 | ||||
|   echo /dev/sdb > /sys/fs/bcache/register | ||||
|   echo /dev/sdc > /sys/fs/bcache/register | ||||
| 
 | ||||
| To register your bcache devices automatically, you could add something like | ||||
| this to an init script: | ||||
| 
 | ||||
|   echo /dev/sd* > /sys/fs/bcache/register_quiet | ||||
| 
 | ||||
| It'll look for bcache superblocks and ignore everything that doesn't have one. | ||||
| 
 | ||||
| Registering the backing device makes the bcache show up in /dev; you can now | ||||
| format it and use it as normal. But the first time using a new bcache device, | ||||
| it'll be running in passthrough mode until you attach it to a cache. See the | ||||
| section on attaching. | ||||
| 
 | ||||
| The devices show up at /dev/bcacheN, and can be controlled via sysfs from | ||||
| /sys/block/bcacheN/bcache: | ||||
| 
 | ||||
|   mkfs.ext4 /dev/bcache0 | ||||
|   mount /dev/bcache0 /mnt | ||||
| 
 | ||||
| Cache devices are managed as sets; multiple caches per set isn't supported yet | ||||
| but will allow for mirroring of metadata and dirty data in the future. Your new | ||||
| cache set shows up as /sys/fs/bcache/<UUID> | ||||
| 
 | ||||
| ATTACHING: | ||||
| 
 | ||||
| After your cache device and backing device are registered, the backing device | ||||
| must be attached to your cache set to enable caching. Attaching a backing | ||||
| device to a cache set is done thusly, with the UUID of the cache set in | ||||
| /sys/fs/bcache: | ||||
| 
 | ||||
|   echo <UUID> > /sys/block/bcache0/bcache/attach | ||||
| 
 | ||||
| This only has to be done once. The next time you reboot, just reregister all | ||||
| your bcache devices. If a backing device has data in a cache somewhere, the | ||||
| /dev/bcache# device won't be created until the cache shows up - particularly | ||||
| important if you have writeback caching turned on. | ||||
| 
 | ||||
| If you're booting up and your cache device is gone and never coming back, you | ||||
| can force run the backing device: | ||||
| 
 | ||||
|   echo 1 > /sys/block/sdb/bcache/running | ||||
| 
 | ||||
| (You need to use /sys/block/sdb (or whatever your backing device is called), not | ||||
| /sys/block/bcache0, because bcache0 doesn't exist yet. If you're using a | ||||
| partition, the bcache directory would be at /sys/block/sdb/sdb2/bcache) | ||||
| 
 | ||||
| The backing device will still use that cache set if it shows up in the future, | ||||
| but all the cached data will be invalidated. If there was dirty data in the | ||||
| cache, don't expect the filesystem to be recoverable - you will have massive | ||||
| filesystem corruption, though ext4's fsck does work miracles. | ||||
| 
 | ||||
| SYSFS - BACKING DEVICE: | ||||
| 
 | ||||
| attach | ||||
|   Echo the UUID of a cache set to this file to enable caching. | ||||
| 
 | ||||
| cache_mode | ||||
|   Can be one of either writethrough, writeback, writearound or none. | ||||
| 
 | ||||
| clear_stats | ||||
|   Writing to this file resets the running total stats (not the day/hour/5 minute | ||||
|   decaying versions). | ||||
| 
 | ||||
| detach | ||||
|   Write to this file to detach from a cache set. If there is dirty data in the | ||||
|   cache, it will be flushed first. | ||||
| 
 | ||||
| dirty_data | ||||
|   Amount of dirty data for this backing device in the cache. Continuously | ||||
|   updated unlike the cache set's version, but may be slightly off. | ||||
| 
 | ||||
| label | ||||
|   Name of underlying device. | ||||
| 
 | ||||
| readahead | ||||
|   Size of readahead that should be performed.  Defaults to 0.  If set to e.g. | ||||
|   1M, it will round cache miss reads up to that size, but without overlapping | ||||
|   existing cache entries. | ||||
| 
 | ||||
| running | ||||
|   1 if bcache is running (i.e. whether the /dev/bcache device exists, whether | ||||
|   it's in passthrough mode or caching). | ||||
| 
 | ||||
| sequential_cutoff | ||||
|   A sequential IO will bypass the cache once it passes this threshold; the | ||||
|   most recent 128 IOs are tracked so sequential IO can be detected even when | ||||
|   it isn't all done at once. | ||||
| 
 | ||||
| sequential_merge | ||||
|   If non zero, bcache keeps a list of the last 128 requests submitted to compare | ||||
|   against all new requests to determine which new requests are sequential | ||||
|   continuations of previous requests for the purpose of determining sequential | ||||
|   cutoff. This is necessary if the sequential cutoff value is greater than the | ||||
|   maximum acceptable sequential size for any single request.  | ||||
| 
 | ||||
| state | ||||
|   The backing device can be in one of four different states: | ||||
| 
 | ||||
|   no cache: Has never been attached to a cache set. | ||||
| 
 | ||||
|   clean: Part of a cache set, and there is no cached dirty data. | ||||
| 
 | ||||
|   dirty: Part of a cache set, and there is cached dirty data. | ||||
| 
 | ||||
|   inconsistent: The backing device was forcibly run by the user when there was | ||||
|   dirty data cached but the cache set was unavailable; whatever data was on the | ||||
|   backing device has likely been corrupted. | ||||
| 
 | ||||
| stop | ||||
|   Write to this file to shut down the bcache device and close the backing | ||||
|   device. | ||||
| 
 | ||||
| writeback_delay | ||||
|   When dirty data is written to the cache and it previously did not contain | ||||
|   any, waits some number of seconds before initiating writeback. Defaults to | ||||
|   30. | ||||
| 
 | ||||
| writeback_percent | ||||
|   If nonzero, bcache tries to keep around this percentage of the cache dirty by | ||||
|   throttling background writeback and using a PD controller to smoothly adjust | ||||
|   the rate. | ||||
| 
 | ||||
| writeback_rate | ||||
|   Rate in sectors per second - if writeback_percent is nonzero, background | ||||
|   writeback is throttled to this rate. Continuously adjusted by bcache but may | ||||
|   also be set by the user. | ||||
| 
 | ||||
| writeback_running | ||||
|   If off, writeback of dirty data will not take place at all. Dirty data will | ||||
|   still be added to the cache until it is mostly full; only meant for | ||||
|   benchmarking. Defaults to on. | ||||
| 
 | ||||
| SYSFS - BACKING DEVICE STATS: | ||||
| 
 | ||||
| There are directories with these numbers for a running total, as well as | ||||
| versions that decay over the past day, hour and 5 minutes; they're also | ||||
| aggregated in the cache set directory as well. | ||||
| 
 | ||||
| bypassed | ||||
|   Amount of IO (both reads and writes) that has bypassed the cache | ||||
| 
 | ||||
| cache_hits | ||||
| cache_misses | ||||
| cache_hit_ratio | ||||
|   Hits and misses are counted per individual IO as bcache sees them; a | ||||
|   partial hit is counted as a miss. | ||||
| 
 | ||||
| cache_bypass_hits | ||||
| cache_bypass_misses | ||||
|   Hits and misses for IO that is intended to skip the cache are still counted, | ||||
|   but broken out here. | ||||
| 
 | ||||
| cache_miss_collisions | ||||
|   Counts instances where data was going to be inserted into the cache from a | ||||
|   cache miss, but raced with a write and data was already present (usually 0 | ||||
|   since the synchronization for cache misses was rewritten) | ||||
| 
 | ||||
| cache_readaheads | ||||
|   Count of times readahead occurred. | ||||
| 
 | ||||
| SYSFS - CACHE SET: | ||||
| 
 | ||||
| average_key_size | ||||
|   Average data per key in the btree. | ||||
| 
 | ||||
| bdev<0..n> | ||||
|   Symlink to each of the attached backing devices. | ||||
| 
 | ||||
| block_size | ||||
|   Block size of the cache devices. | ||||
| 
 | ||||
| btree_cache_size | ||||
|   Amount of memory currently used by the btree cache | ||||
| 
 | ||||
| bucket_size | ||||
|   Size of buckets | ||||
| 
 | ||||
| cache<0..n> | ||||
|   Symlink to each of the cache devices comprising this cache set.  | ||||
| 
 | ||||
| cache_available_percent | ||||
|   Percentage of cache device free. | ||||
| 
 | ||||
| clear_stats | ||||
|   Clears the statistics associated with this cache | ||||
| 
 | ||||
| dirty_data | ||||
|   Amount of dirty data in the cache (updated when garbage collection runs). | ||||
| 
 | ||||
| flash_vol_create | ||||
|   Echoing a size to this file (in human readable units, k/M/G) creates a thinly | ||||
|   provisioned volume backed by the cache set. | ||||
| 
 | ||||
| io_error_halflife | ||||
| io_error_limit | ||||
|   These determine how many errors we accept before disabling the cache. | ||||
|   Each error is decayed by the half life (in # ios).  If the decaying count | ||||
|   reaches io_error_limit dirty data is written out and the cache is disabled. | ||||
| 
 | ||||
| journal_delay_ms | ||||
|   Journal writes will delay for up to this many milliseconds, unless a cache | ||||
|   flush happens sooner. Defaults to 100. | ||||
| 
 | ||||
| root_usage_percent | ||||
|   Percentage of the root btree node in use.  If this gets too high the node | ||||
|   will split, increasing the tree depth. | ||||
| 
 | ||||
| stop | ||||
|   Write to this file to shut down the cache set - waits until all attached | ||||
|   backing devices have been shut down. | ||||
| 
 | ||||
| tree_depth | ||||
|   Depth of the btree (A single node btree has depth 0). | ||||
| 
 | ||||
| unregister | ||||
|   Detaches all backing devices and closes the cache devices; if dirty data is | ||||
|   present it will disable writeback caching and wait for it to be flushed. | ||||
| 
 | ||||
| SYSFS - CACHE SET INTERNAL: | ||||
| 
 | ||||
| This directory also exposes timings for a number of internal operations, with | ||||
| separate files for average duration, average frequency, last occurrence and max | ||||
| duration: garbage collection, btree read, btree node sorts and btree splits. | ||||
| 
 | ||||
| active_journal_entries | ||||
|   Number of journal entries that are newer than the index. | ||||
| 
 | ||||
| btree_nodes | ||||
|   Total nodes in the btree. | ||||
| 
 | ||||
| btree_used_percent | ||||
|   Average fraction of btree in use. | ||||
| 
 | ||||
| bset_tree_stats | ||||
|   Statistics about the auxiliary search trees | ||||
| 
 | ||||
| btree_cache_max_chain | ||||
|   Longest chain in the btree node cache's hash table | ||||
| 
 | ||||
| cache_read_races | ||||
|   Counts instances where while data was being read from the cache, the bucket | ||||
|   was reused and invalidated - i.e. where the pointer was stale after the read | ||||
|   completed. When this occurs the data is reread from the backing device. | ||||
| 
 | ||||
| trigger_gc | ||||
|   Writing to this file forces garbage collection to run. | ||||
| 
 | ||||
| SYSFS - CACHE DEVICE: | ||||
| 
 | ||||
| block_size | ||||
|   Minimum granularity of writes - should match hardware sector size. | ||||
| 
 | ||||
| btree_written | ||||
|   Sum of all btree writes, in (kilo/mega/giga) bytes | ||||
| 
 | ||||
| bucket_size | ||||
|   Size of buckets | ||||
| 
 | ||||
| cache_replacement_policy | ||||
|   One of either lru, fifo or random. | ||||
| 
 | ||||
| discard | ||||
|   Boolean; if on a discard/TRIM will be issued to each bucket before it is | ||||
|   reused. Defaults to off, since SATA TRIM is an unqueued command (and thus | ||||
|   slow). | ||||
| 
 | ||||
| freelist_percent | ||||
|   Size of the freelist as a percentage of nbuckets. Can be written to to | ||||
|   increase the number of buckets kept on the freelist, which lets you | ||||
|   artificially reduce the size of the cache at runtime. Mostly for testing | ||||
|   purposes (i.e. testing how different size caches affect your hit rate), but | ||||
|   since buckets are discarded when they move on to the freelist will also make | ||||
|   the SSD's garbage collection easier by effectively giving it more reserved | ||||
|   space. | ||||
| 
 | ||||
| io_errors | ||||
|   Number of errors that have occurred, decayed by io_error_halflife. | ||||
| 
 | ||||
| metadata_written | ||||
|   Sum of all non data writes (btree writes and all other metadata). | ||||
| 
 | ||||
| nbuckets | ||||
|   Total buckets in this cache | ||||
| 
 | ||||
| priority_stats | ||||
|   Statistics about how recently data in the cache has been accessed.  This can | ||||
|   reveal your working set size. | ||||
| 
 | ||||
| written | ||||
|   Sum of all data that has been written to the cache; comparison with | ||||
|   btree_written gives the amount of write inflation in bcache. | ||||
|  | @ -1616,6 +1616,13 @@ W:	http://www.baycom.org/~tom/ham/ham.html | |||
| S:	Maintained | ||||
| F:	drivers/net/hamradio/baycom* | ||||
| 
 | ||||
| BCACHE (BLOCK LAYER CACHE) | ||||
| M:	Kent Overstreet <koverstreet@google.com> | ||||
| L:	linux-bcache@vger.kernel.org | ||||
| W:	http://bcache.evilpiepirate.org | ||||
| S:	Maintained | ||||
| F:	drivers/md/bcache/ | ||||
| 
 | ||||
| BEFS FILE SYSTEM | ||||
| S:	Orphan | ||||
| F:	Documentation/filesystems/befs.txt | ||||
|  |  | |||
|  | @ -174,6 +174,8 @@ config MD_FAULTY | |||
| 
 | ||||
| 	  In unsure, say N. | ||||
| 
 | ||||
| source "drivers/md/bcache/Kconfig" | ||||
| 
 | ||||
| config BLK_DEV_DM | ||||
| 	tristate "Device mapper support" | ||||
| 	---help--- | ||||
|  |  | |||
|  | @ -29,6 +29,7 @@ obj-$(CONFIG_MD_RAID10)		+= raid10.o | |||
| obj-$(CONFIG_MD_RAID456)	+= raid456.o | ||||
| obj-$(CONFIG_MD_MULTIPATH)	+= multipath.o | ||||
| obj-$(CONFIG_MD_FAULTY)		+= faulty.o | ||||
| obj-$(CONFIG_BCACHE)		+= bcache/ | ||||
| obj-$(CONFIG_BLK_DEV_MD)	+= md-mod.o | ||||
| obj-$(CONFIG_BLK_DEV_DM)	+= dm-mod.o | ||||
| obj-$(CONFIG_DM_BUFIO)		+= dm-bufio.o | ||||
|  |  | |||
							
								
								
									
										42
									
								
								drivers/md/bcache/Kconfig
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										42
									
								
								drivers/md/bcache/Kconfig
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,42 @@ | |||
| 
 | ||||
| config BCACHE | ||||
| 	tristate "Block device as cache" | ||||
| 	select CLOSURES | ||||
| 	---help--- | ||||
| 	Allows a block device to be used as cache for other devices; uses | ||||
| 	a btree for indexing and the layout is optimized for SSDs. | ||||
| 
 | ||||
| 	See Documentation/bcache.txt for details. | ||||
| 
 | ||||
| config BCACHE_DEBUG | ||||
| 	bool "Bcache debugging" | ||||
| 	depends on BCACHE | ||||
| 	---help--- | ||||
| 	Don't select this option unless you're a developer | ||||
| 
 | ||||
| 	Enables extra debugging tools (primarily a fuzz tester) | ||||
| 
 | ||||
| config BCACHE_EDEBUG | ||||
| 	bool "Extended runtime checks" | ||||
| 	depends on BCACHE | ||||
| 	---help--- | ||||
| 	Don't select this option unless you're a developer | ||||
| 
 | ||||
| 	Enables extra runtime checks which significantly affect performance | ||||
| 
 | ||||
| config BCACHE_CLOSURES_DEBUG | ||||
| 	bool "Debug closures" | ||||
| 	depends on BCACHE | ||||
| 	select DEBUG_FS | ||||
| 	---help--- | ||||
| 	Keeps all active closures in a linked list and provides a debugfs | ||||
| 	interface to list them, which makes it possible to see asynchronous | ||||
| 	operations that get stuck. | ||||
| 
 | ||||
| # cgroup code needs to be updated: | ||||
| # | ||||
| #config CGROUP_BCACHE | ||||
| #	bool "Cgroup controls for bcache" | ||||
| #	depends on BCACHE && BLK_CGROUP | ||||
| #	---help--- | ||||
| #	TODO | ||||
							
								
								
									
										7
									
								
								drivers/md/bcache/Makefile
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										7
									
								
								drivers/md/bcache/Makefile
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,7 @@ | |||
| 
 | ||||
| obj-$(CONFIG_BCACHE)	+= bcache.o | ||||
| 
 | ||||
| bcache-y		:= alloc.o btree.o bset.o io.o journal.o writeback.o\
 | ||||
| 	movinggc.o request.o super.o sysfs.o debug.o util.o trace.o stats.o closure.o | ||||
| 
 | ||||
| CFLAGS_request.o	+= -Iblock | ||||
							
								
								
									
										583
									
								
								drivers/md/bcache/alloc.c
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										583
									
								
								drivers/md/bcache/alloc.c
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,583 @@ | |||
| /*
 | ||||
|  * Primary bucket allocation code | ||||
|  * | ||||
|  * Copyright 2012 Google, Inc. | ||||
|  * | ||||
|  * Allocation in bcache is done in terms of buckets: | ||||
|  * | ||||
|  * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in | ||||
|  * btree pointers - they must match for the pointer to be considered valid. | ||||
|  * | ||||
|  * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a | ||||
|  * bucket simply by incrementing its gen. | ||||
|  * | ||||
|  * The gens (along with the priorities; it's really the gens that are important but | ||||
|  * the code is named as if it's the priorities) are written in an arbitrary list | ||||
|  * of buckets on disk, with a pointer to them in the journal header. | ||||
|  * | ||||
|  * When we invalidate a bucket, we have to write its new gen to disk and wait | ||||
|  * for that write to complete before we use it - otherwise after a crash we | ||||
|  * could have pointers that appeared to be good but pointed to data that had | ||||
|  * been overwritten. | ||||
|  * | ||||
|  * Since the gens and priorities are all stored contiguously on disk, we can | ||||
|  * batch this up: We fill up the free_inc list with freshly invalidated buckets, | ||||
|  * call prio_write(), and when prio_write() finishes we pull buckets off the | ||||
|  * free_inc list and optionally discard them. | ||||
|  * | ||||
|  * free_inc isn't the only freelist - if it was, we'd often have to sleep while | ||||
|  * priorities and gens were being written before we could allocate. c->free is a | ||||
|  * smaller freelist, and buckets on that list are always ready to be used. | ||||
|  * | ||||
|  * If we've got discards enabled, that happens when a bucket moves from the | ||||
|  * free_inc list to the free list. | ||||
|  * | ||||
|  * There is another freelist, because sometimes we have buckets that we know | ||||
|  * have nothing pointing into them - these we can reuse without waiting for | ||||
|  * priorities to be rewritten. These come from freed btree nodes and buckets | ||||
|  * that garbage collection discovered no longer had valid keys pointing into | ||||
|  * them (because they were overwritten). That's the unused list - buckets on the | ||||
|  * unused list move to the free list, optionally being discarded in the process. | ||||
|  * | ||||
|  * It's also important to ensure that gens don't wrap around - with respect to | ||||
|  * either the oldest gen in the btree or the gen on disk. This is quite | ||||
|  * difficult to do in practice, but we explicitly guard against it anyways - if | ||||
|  * a bucket is in danger of wrapping around we simply skip invalidating it that | ||||
|  * time around, and we garbage collect or rewrite the priorities sooner than we | ||||
|  * would have otherwise. | ||||
|  * | ||||
|  * bch_bucket_alloc() allocates a single bucket from a specific cache. | ||||
|  * | ||||
|  * bch_bucket_alloc_set() allocates one or more buckets from different caches | ||||
|  * out of a cache set. | ||||
|  * | ||||
|  * free_some_buckets() drives all the processes described above. It's called | ||||
|  * from bch_bucket_alloc() and a few other places that need to make sure free | ||||
|  * buckets are ready. | ||||
|  * | ||||
|  * invalidate_buckets_(lru|fifo)() find buckets that are available to be | ||||
|  * invalidated, and then invalidate them and stick them on the free_inc list - | ||||
|  * in either lru or fifo order. | ||||
|  */ | ||||
| 
 | ||||
| #include "bcache.h" | ||||
| #include "btree.h" | ||||
| 
 | ||||
| #include <linux/random.h> | ||||
| 
 | ||||
| #define MAX_IN_FLIGHT_DISCARDS		8U | ||||
| 
 | ||||
| /* Bucket heap / gen */ | ||||
| 
 | ||||
| uint8_t bch_inc_gen(struct cache *ca, struct bucket *b) | ||||
| { | ||||
| 	uint8_t ret = ++b->gen; | ||||
| 
 | ||||
| 	ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b)); | ||||
| 	WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX); | ||||
| 
 | ||||
| 	if (CACHE_SYNC(&ca->set->sb)) { | ||||
| 		ca->need_save_prio = max(ca->need_save_prio, | ||||
| 					 bucket_disk_gen(b)); | ||||
| 		WARN_ON_ONCE(ca->need_save_prio > BUCKET_DISK_GEN_MAX); | ||||
| 	} | ||||
| 
 | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| void bch_rescale_priorities(struct cache_set *c, int sectors) | ||||
| { | ||||
| 	struct cache *ca; | ||||
| 	struct bucket *b; | ||||
| 	unsigned next = c->nbuckets * c->sb.bucket_size / 1024; | ||||
| 	unsigned i; | ||||
| 	int r; | ||||
| 
 | ||||
| 	atomic_sub(sectors, &c->rescale); | ||||
| 
 | ||||
| 	do { | ||||
| 		r = atomic_read(&c->rescale); | ||||
| 
 | ||||
| 		if (r >= 0) | ||||
| 			return; | ||||
| 	} while (atomic_cmpxchg(&c->rescale, r, r + next) != r); | ||||
| 
 | ||||
| 	mutex_lock(&c->bucket_lock); | ||||
| 
 | ||||
| 	c->min_prio = USHRT_MAX; | ||||
| 
 | ||||
| 	for_each_cache(ca, c, i) | ||||
| 		for_each_bucket(b, ca) | ||||
| 			if (b->prio && | ||||
| 			    b->prio != BTREE_PRIO && | ||||
| 			    !atomic_read(&b->pin)) { | ||||
| 				b->prio--; | ||||
| 				c->min_prio = min(c->min_prio, b->prio); | ||||
| 			} | ||||
| 
 | ||||
| 	mutex_unlock(&c->bucket_lock); | ||||
| } | ||||
| 
 | ||||
| /* Discard/TRIM */ | ||||
| 
 | ||||
| struct discard { | ||||
| 	struct list_head	list; | ||||
| 	struct work_struct	work; | ||||
| 	struct cache		*ca; | ||||
| 	long			bucket; | ||||
| 
 | ||||
| 	struct bio		bio; | ||||
| 	struct bio_vec		bv; | ||||
| }; | ||||
| 
 | ||||
| static void discard_finish(struct work_struct *w) | ||||
| { | ||||
| 	struct discard *d = container_of(w, struct discard, work); | ||||
| 	struct cache *ca = d->ca; | ||||
| 	char buf[BDEVNAME_SIZE]; | ||||
| 
 | ||||
| 	if (!test_bit(BIO_UPTODATE, &d->bio.bi_flags)) { | ||||
| 		pr_notice("discard error on %s, disabling", | ||||
| 			 bdevname(ca->bdev, buf)); | ||||
| 		d->ca->discard = 0; | ||||
| 	} | ||||
| 
 | ||||
| 	mutex_lock(&ca->set->bucket_lock); | ||||
| 
 | ||||
| 	fifo_push(&ca->free, d->bucket); | ||||
| 	list_add(&d->list, &ca->discards); | ||||
| 	atomic_dec(&ca->discards_in_flight); | ||||
| 
 | ||||
| 	mutex_unlock(&ca->set->bucket_lock); | ||||
| 
 | ||||
| 	closure_wake_up(&ca->set->bucket_wait); | ||||
| 	wake_up(&ca->set->alloc_wait); | ||||
| 
 | ||||
| 	closure_put(&ca->set->cl); | ||||
| } | ||||
| 
 | ||||
| static void discard_endio(struct bio *bio, int error) | ||||
| { | ||||
| 	struct discard *d = container_of(bio, struct discard, bio); | ||||
| 	schedule_work(&d->work); | ||||
| } | ||||
| 
 | ||||
| static void do_discard(struct cache *ca, long bucket) | ||||
| { | ||||
| 	struct discard *d = list_first_entry(&ca->discards, | ||||
| 					     struct discard, list); | ||||
| 
 | ||||
| 	list_del(&d->list); | ||||
| 	d->bucket = bucket; | ||||
| 
 | ||||
| 	atomic_inc(&ca->discards_in_flight); | ||||
| 	closure_get(&ca->set->cl); | ||||
| 
 | ||||
| 	bio_init(&d->bio); | ||||
| 
 | ||||
| 	d->bio.bi_sector	= bucket_to_sector(ca->set, d->bucket); | ||||
| 	d->bio.bi_bdev		= ca->bdev; | ||||
| 	d->bio.bi_rw		= REQ_WRITE|REQ_DISCARD; | ||||
| 	d->bio.bi_max_vecs	= 1; | ||||
| 	d->bio.bi_io_vec	= d->bio.bi_inline_vecs; | ||||
| 	d->bio.bi_size		= bucket_bytes(ca); | ||||
| 	d->bio.bi_end_io	= discard_endio; | ||||
| 	bio_set_prio(&d->bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); | ||||
| 
 | ||||
| 	submit_bio(0, &d->bio); | ||||
| } | ||||
| 
 | ||||
| /* Allocation */ | ||||
| 
 | ||||
| static inline bool can_inc_bucket_gen(struct bucket *b) | ||||
| { | ||||
| 	return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX && | ||||
| 		bucket_disk_gen(b) < BUCKET_DISK_GEN_MAX; | ||||
| } | ||||
| 
 | ||||
| bool bch_bucket_add_unused(struct cache *ca, struct bucket *b) | ||||
| { | ||||
| 	BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b)); | ||||
| 
 | ||||
| 	if (fifo_used(&ca->free) > ca->watermark[WATERMARK_MOVINGGC] && | ||||
| 	    CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) | ||||
| 		return false; | ||||
| 
 | ||||
| 	b->prio = 0; | ||||
| 
 | ||||
| 	if (can_inc_bucket_gen(b) && | ||||
| 	    fifo_push(&ca->unused, b - ca->buckets)) { | ||||
| 		atomic_inc(&b->pin); | ||||
| 		return true; | ||||
| 	} | ||||
| 
 | ||||
| 	return false; | ||||
| } | ||||
| 
 | ||||
| static bool can_invalidate_bucket(struct cache *ca, struct bucket *b) | ||||
| { | ||||
| 	return GC_MARK(b) == GC_MARK_RECLAIMABLE && | ||||
| 		!atomic_read(&b->pin) && | ||||
| 		can_inc_bucket_gen(b); | ||||
| } | ||||
| 
 | ||||
| static void invalidate_one_bucket(struct cache *ca, struct bucket *b) | ||||
| { | ||||
| 	bch_inc_gen(ca, b); | ||||
| 	b->prio = INITIAL_PRIO; | ||||
| 	atomic_inc(&b->pin); | ||||
| 	fifo_push(&ca->free_inc, b - ca->buckets); | ||||
| } | ||||
| 
 | ||||
| static void invalidate_buckets_lru(struct cache *ca) | ||||
| { | ||||
| 	unsigned bucket_prio(struct bucket *b) | ||||
| 	{ | ||||
| 		return ((unsigned) (b->prio - ca->set->min_prio)) * | ||||
| 			GC_SECTORS_USED(b); | ||||
| 	} | ||||
| 
 | ||||
| 	bool bucket_max_cmp(struct bucket *l, struct bucket *r) | ||||
| 	{ | ||||
| 		return bucket_prio(l) < bucket_prio(r); | ||||
| 	} | ||||
| 
 | ||||
| 	bool bucket_min_cmp(struct bucket *l, struct bucket *r) | ||||
| 	{ | ||||
| 		return bucket_prio(l) > bucket_prio(r); | ||||
| 	} | ||||
| 
 | ||||
| 	struct bucket *b; | ||||
| 	ssize_t i; | ||||
| 
 | ||||
| 	ca->heap.used = 0; | ||||
| 
 | ||||
| 	for_each_bucket(b, ca) { | ||||
| 		if (!can_invalidate_bucket(ca, b)) | ||||
| 			continue; | ||||
| 
 | ||||
| 		if (!GC_SECTORS_USED(b)) { | ||||
| 			if (!bch_bucket_add_unused(ca, b)) | ||||
| 				return; | ||||
| 		} else { | ||||
| 			if (!heap_full(&ca->heap)) | ||||
| 				heap_add(&ca->heap, b, bucket_max_cmp); | ||||
| 			else if (bucket_max_cmp(b, heap_peek(&ca->heap))) { | ||||
| 				ca->heap.data[0] = b; | ||||
| 				heap_sift(&ca->heap, 0, bucket_max_cmp); | ||||
| 			} | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	if (ca->heap.used * 2 < ca->heap.size) | ||||
| 		bch_queue_gc(ca->set); | ||||
| 
 | ||||
| 	for (i = ca->heap.used / 2 - 1; i >= 0; --i) | ||||
| 		heap_sift(&ca->heap, i, bucket_min_cmp); | ||||
| 
 | ||||
| 	while (!fifo_full(&ca->free_inc)) { | ||||
| 		if (!heap_pop(&ca->heap, b, bucket_min_cmp)) { | ||||
| 			/* We don't want to be calling invalidate_buckets()
 | ||||
| 			 * multiple times when it can't do anything | ||||
| 			 */ | ||||
| 			ca->invalidate_needs_gc = 1; | ||||
| 			bch_queue_gc(ca->set); | ||||
| 			return; | ||||
| 		} | ||||
| 
 | ||||
| 		invalidate_one_bucket(ca, b); | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| static void invalidate_buckets_fifo(struct cache *ca) | ||||
| { | ||||
| 	struct bucket *b; | ||||
| 	size_t checked = 0; | ||||
| 
 | ||||
| 	while (!fifo_full(&ca->free_inc)) { | ||||
| 		if (ca->fifo_last_bucket <  ca->sb.first_bucket || | ||||
| 		    ca->fifo_last_bucket >= ca->sb.nbuckets) | ||||
| 			ca->fifo_last_bucket = ca->sb.first_bucket; | ||||
| 
 | ||||
| 		b = ca->buckets + ca->fifo_last_bucket++; | ||||
| 
 | ||||
| 		if (can_invalidate_bucket(ca, b)) | ||||
| 			invalidate_one_bucket(ca, b); | ||||
| 
 | ||||
| 		if (++checked >= ca->sb.nbuckets) { | ||||
| 			ca->invalidate_needs_gc = 1; | ||||
| 			bch_queue_gc(ca->set); | ||||
| 			return; | ||||
| 		} | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| static void invalidate_buckets_random(struct cache *ca) | ||||
| { | ||||
| 	struct bucket *b; | ||||
| 	size_t checked = 0; | ||||
| 
 | ||||
| 	while (!fifo_full(&ca->free_inc)) { | ||||
| 		size_t n; | ||||
| 		get_random_bytes(&n, sizeof(n)); | ||||
| 
 | ||||
| 		n %= (size_t) (ca->sb.nbuckets - ca->sb.first_bucket); | ||||
| 		n += ca->sb.first_bucket; | ||||
| 
 | ||||
| 		b = ca->buckets + n; | ||||
| 
 | ||||
| 		if (can_invalidate_bucket(ca, b)) | ||||
| 			invalidate_one_bucket(ca, b); | ||||
| 
 | ||||
| 		if (++checked >= ca->sb.nbuckets / 2) { | ||||
| 			ca->invalidate_needs_gc = 1; | ||||
| 			bch_queue_gc(ca->set); | ||||
| 			return; | ||||
| 		} | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| static void invalidate_buckets(struct cache *ca) | ||||
| { | ||||
| 	if (ca->invalidate_needs_gc) | ||||
| 		return; | ||||
| 
 | ||||
| 	switch (CACHE_REPLACEMENT(&ca->sb)) { | ||||
| 	case CACHE_REPLACEMENT_LRU: | ||||
| 		invalidate_buckets_lru(ca); | ||||
| 		break; | ||||
| 	case CACHE_REPLACEMENT_FIFO: | ||||
| 		invalidate_buckets_fifo(ca); | ||||
| 		break; | ||||
| 	case CACHE_REPLACEMENT_RANDOM: | ||||
| 		invalidate_buckets_random(ca); | ||||
| 		break; | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
/*
 * allocator_wait(ca, cond) - sleep on the cache set's alloc_wait queue until
 * @cond becomes true.
 *
 * Must be invoked with (ca)->set->bucket_lock held.  The lock is dropped
 * around schedule() and re-taken before @cond is evaluated again, so @cond
 * is always checked under the lock.
 *
 * If CACHE_SET_STOPPING_2 is set in the cache set's flags while waiting, the
 * macro finishes the wait and invokes closure_return(cl) WITHOUT re-taking
 * bucket_lock.  Note that this relies on a variable named "cl" being in scope
 * at the call site (it is not a macro parameter); closure_return() is
 * presumably what takes the calling function out of its loop - confirm
 * against the closure code.
 *
 * @ca is parenthesized at every use so that any pointer expression can be
 * passed safely.
 */
#define allocator_wait(ca, cond)					\
do {									\
	DEFINE_WAIT(__wait);						\
									\
	while (!(cond)) {						\
		prepare_to_wait(&(ca)->set->alloc_wait,			\
				&__wait, TASK_INTERRUPTIBLE);		\
									\
		mutex_unlock(&(ca)->set->bucket_lock);			\
		if (test_bit(CACHE_SET_STOPPING_2,			\
			     &(ca)->set->flags)) {			\
			finish_wait(&(ca)->set->alloc_wait, &__wait);	\
			closure_return(cl);				\
		}							\
									\
		schedule();						\
		__set_current_state(TASK_RUNNING);			\
		mutex_lock(&(ca)->set->bucket_lock);			\
	}								\
									\
	finish_wait(&(ca)->set->alloc_wait, &__wait);			\
} while (0)
| 
 | ||||
| void bch_allocator_thread(struct closure *cl) | ||||
| { | ||||
| 	struct cache *ca = container_of(cl, struct cache, alloc); | ||||
| 
 | ||||
| 	mutex_lock(&ca->set->bucket_lock); | ||||
| 
 | ||||
| 	while (1) { | ||||
| 		while (1) { | ||||
| 			long bucket; | ||||
| 
 | ||||
| 			if ((!atomic_read(&ca->set->prio_blocked) || | ||||
| 			     !CACHE_SYNC(&ca->set->sb)) && | ||||
| 			    !fifo_empty(&ca->unused)) | ||||
| 				fifo_pop(&ca->unused, bucket); | ||||
| 			else if (!fifo_empty(&ca->free_inc)) | ||||
| 				fifo_pop(&ca->free_inc, bucket); | ||||
| 			else | ||||
| 				break; | ||||
| 
 | ||||
| 			allocator_wait(ca, (int) fifo_free(&ca->free) > | ||||
| 				       atomic_read(&ca->discards_in_flight)); | ||||
| 
 | ||||
| 			if (ca->discard) { | ||||
| 				allocator_wait(ca, !list_empty(&ca->discards)); | ||||
| 				do_discard(ca, bucket); | ||||
| 			} else { | ||||
| 				fifo_push(&ca->free, bucket); | ||||
| 				closure_wake_up(&ca->set->bucket_wait); | ||||
| 			} | ||||
| 		} | ||||
| 
 | ||||
| 		allocator_wait(ca, ca->set->gc_mark_valid); | ||||
| 		invalidate_buckets(ca); | ||||
| 
 | ||||
| 		allocator_wait(ca, !atomic_read(&ca->set->prio_blocked) || | ||||
| 			       !CACHE_SYNC(&ca->set->sb)); | ||||
| 
 | ||||
| 		if (CACHE_SYNC(&ca->set->sb) && | ||||
| 		    (!fifo_empty(&ca->free_inc) || | ||||
| 		     ca->need_save_prio > 64)) { | ||||
| 			bch_prio_write(ca); | ||||
| 		} | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl) | ||||
| { | ||||
| 	long r = -1; | ||||
| again: | ||||
| 	wake_up(&ca->set->alloc_wait); | ||||
| 
 | ||||
| 	if (fifo_used(&ca->free) > ca->watermark[watermark] && | ||||
| 	    fifo_pop(&ca->free, r)) { | ||||
| 		struct bucket *b = ca->buckets + r; | ||||
| #ifdef CONFIG_BCACHE_EDEBUG | ||||
| 		size_t iter; | ||||
| 		long i; | ||||
| 
 | ||||
| 		for (iter = 0; iter < prio_buckets(ca) * 2; iter++) | ||||
| 			BUG_ON(ca->prio_buckets[iter] == (uint64_t) r); | ||||
| 
 | ||||
| 		fifo_for_each(i, &ca->free, iter) | ||||
| 			BUG_ON(i == r); | ||||
| 		fifo_for_each(i, &ca->free_inc, iter) | ||||
| 			BUG_ON(i == r); | ||||
| 		fifo_for_each(i, &ca->unused, iter) | ||||
| 			BUG_ON(i == r); | ||||
| #endif | ||||
| 		BUG_ON(atomic_read(&b->pin) != 1); | ||||
| 
 | ||||
| 		SET_GC_SECTORS_USED(b, ca->sb.bucket_size); | ||||
| 
 | ||||
| 		if (watermark <= WATERMARK_METADATA) { | ||||
| 			SET_GC_MARK(b, GC_MARK_METADATA); | ||||
| 			b->prio = BTREE_PRIO; | ||||
| 		} else { | ||||
| 			SET_GC_MARK(b, GC_MARK_RECLAIMABLE); | ||||
| 			b->prio = INITIAL_PRIO; | ||||
| 		} | ||||
| 
 | ||||
| 		return r; | ||||
| 	} | ||||
| 
 | ||||
| 	pr_debug("alloc failure: blocked %i free %zu free_inc %zu unused %zu", | ||||
| 		 atomic_read(&ca->set->prio_blocked), fifo_used(&ca->free), | ||||
| 		 fifo_used(&ca->free_inc), fifo_used(&ca->unused)); | ||||
| 
 | ||||
| 	if (cl) { | ||||
| 		closure_wait(&ca->set->bucket_wait, cl); | ||||
| 
 | ||||
| 		if (closure_blocking(cl)) { | ||||
| 			mutex_unlock(&ca->set->bucket_lock); | ||||
| 			closure_sync(cl); | ||||
| 			mutex_lock(&ca->set->bucket_lock); | ||||
| 			goto again; | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	return -1; | ||||
| } | ||||
| 
 | ||||
/*
 * Release every bucket pointed to by key @k: clear the bucket's GC mark and
 * used-sector count and try to put it on its cache's unused list (the result
 * of bch_bucket_add_unused() is ignored).
 */
void bch_bucket_free(struct cache_set *c, struct bkey *k)
{
	unsigned ptr;

	for (ptr = 0; ptr < KEY_PTRS(k); ptr++) {
		struct bucket *b = PTR_BUCKET(c, k, ptr);
		struct cache *ca = PTR_CACHE(c, k, ptr);

		SET_GC_MARK(b, 0);
		SET_GC_SECTORS_USED(b, 0);
		bch_bucket_add_unused(ca, b);
	}
}
| 
 | ||||
| int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, | ||||
| 			   struct bkey *k, int n, struct closure *cl) | ||||
| { | ||||
| 	int i; | ||||
| 
 | ||||
| 	lockdep_assert_held(&c->bucket_lock); | ||||
| 	BUG_ON(!n || n > c->caches_loaded || n > 8); | ||||
| 
 | ||||
| 	bkey_init(k); | ||||
| 
 | ||||
| 	/* sort by free space/prio of oldest data in caches */ | ||||
| 
 | ||||
| 	for (i = 0; i < n; i++) { | ||||
| 		struct cache *ca = c->cache_by_alloc[i]; | ||||
| 		long b = bch_bucket_alloc(ca, watermark, cl); | ||||
| 
 | ||||
| 		if (b == -1) | ||||
| 			goto err; | ||||
| 
 | ||||
| 		k->ptr[i] = PTR(ca->buckets[b].gen, | ||||
| 				bucket_to_sector(c, b), | ||||
| 				ca->sb.nr_this_dev); | ||||
| 
 | ||||
| 		SET_KEY_PTRS(k, i + 1); | ||||
| 	} | ||||
| 
 | ||||
| 	return 0; | ||||
| err: | ||||
| 	bch_bucket_free(c, k); | ||||
| 	__bkey_put(c, k); | ||||
| 	return -1; | ||||
| } | ||||
| 
 | ||||
| int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, | ||||
| 			 struct bkey *k, int n, struct closure *cl) | ||||
| { | ||||
| 	int ret; | ||||
| 	mutex_lock(&c->bucket_lock); | ||||
| 	ret = __bch_bucket_alloc_set(c, watermark, k, n, cl); | ||||
| 	mutex_unlock(&c->bucket_lock); | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| /* Init */ | ||||
| 
 | ||||
| void bch_cache_allocator_exit(struct cache *ca) | ||||
| { | ||||
| 	struct discard *d; | ||||
| 
 | ||||
| 	while (!list_empty(&ca->discards)) { | ||||
| 		d = list_first_entry(&ca->discards, struct discard, list); | ||||
| 		cancel_work_sync(&d->work); | ||||
| 		list_del(&d->list); | ||||
| 		kfree(d); | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| int bch_cache_allocator_init(struct cache *ca) | ||||
| { | ||||
| 	unsigned i; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Reserve: | ||||
| 	 * Prio/gen writes first | ||||
| 	 * Then 8 for btree allocations | ||||
| 	 * Then half for the moving garbage collector | ||||
| 	 */ | ||||
| 
 | ||||
| 	ca->watermark[WATERMARK_PRIO] = 0; | ||||
| 
 | ||||
| 	ca->watermark[WATERMARK_METADATA] = prio_buckets(ca); | ||||
| 
 | ||||
| 	ca->watermark[WATERMARK_MOVINGGC] = 8 + | ||||
| 		ca->watermark[WATERMARK_METADATA]; | ||||
| 
 | ||||
| 	ca->watermark[WATERMARK_NONE] = ca->free.size / 2 + | ||||
| 		ca->watermark[WATERMARK_MOVINGGC]; | ||||
| 
 | ||||
| 	for (i = 0; i < MAX_IN_FLIGHT_DISCARDS; i++) { | ||||
| 		struct discard *d = kzalloc(sizeof(*d), GFP_KERNEL); | ||||
| 		if (!d) | ||||
| 			return -ENOMEM; | ||||
| 
 | ||||
| 		d->ca = ca; | ||||
| 		INIT_WORK(&d->work, discard_finish); | ||||
| 		list_add(&d->list, &ca->discards); | ||||
| 	} | ||||
| 
 | ||||
| 	return 0; | ||||
| } | ||||
							
								
								
									
										1232
									
								
								drivers/md/bcache/bcache.h
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										1232
									
								
								drivers/md/bcache/bcache.h
									
										
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							
							
								
								
									
										1190
									
								
								drivers/md/bcache/bset.c
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										1190
									
								
								drivers/md/bcache/bset.c
									
										
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							
							
								
								
									
										379
									
								
								drivers/md/bcache/bset.h
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										379
									
								
								drivers/md/bcache/bset.h
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,379 @@ | |||
| #ifndef _BCACHE_BSET_H | ||||
| #define _BCACHE_BSET_H | ||||
| 
 | ||||
| /*
 | ||||
|  * BKEYS: | ||||
|  * | ||||
|  * A bkey contains a key, a size field, a variable number of pointers, and some | ||||
|  * ancillary flag bits. | ||||
|  * | ||||
|  * We use two different functions for validating bkeys, bch_ptr_invalid() and | ||||
|  * bch_ptr_bad(). | ||||
|  * | ||||
|  * bch_ptr_invalid() primarily filters out keys and pointers that would be | ||||
|  * invalid due to some sort of bug, whereas bch_ptr_bad() filters out keys and | ||||
|  * pointers that occur in normal practice but don't point to real data. | ||||
|  * | ||||
|  * The one exception to the rule that ptr_invalid() filters out invalid keys is | ||||
|  * that it also filters out keys of size 0 - these are keys that have been | ||||
|  * completely overwritten. It'd be safe to delete these in memory while leaving | ||||
|  * them on disk, just unnecessary work - so we filter them out when resorting | ||||
|  * instead. | ||||
|  * | ||||
|  * We can't filter out stale keys when we're resorting, because garbage | ||||
|  * collection needs to find them to ensure bucket gens don't wrap around - | ||||
|  * unless we're rewriting the btree node those stale keys still exist on disk. | ||||
|  * | ||||
|  * We also implement functions here for removing some number of sectors from the | ||||
|  * front or the back of a bkey - this is mainly used for fixing overlapping | ||||
|  * extents, by removing the overlapping sectors from the older key. | ||||
|  * | ||||
|  * BSETS: | ||||
|  * | ||||
|  * A bset is an array of bkeys laid out contiguously in memory in sorted order, | ||||
|  * along with a header. A btree node is made up of a number of these, written at | ||||
|  * different times. | ||||
|  * | ||||
|  * There could be many of them on disk, but we never allow there to be more than | ||||
|  * 4 in memory - we lazily resort as needed. | ||||
|  * | ||||
|  * We implement code here for creating and maintaining auxiliary search trees | ||||
|  * (described below) for searching an individual bset, and on top of that we | ||||
|  * implement a btree iterator. | ||||
|  * | ||||
|  * BTREE ITERATOR: | ||||
|  * | ||||
|  * Most of the code in bcache doesn't care about an individual bset - it needs | ||||
|  * to search entire btree nodes and iterate over them in sorted order. | ||||
|  * | ||||
|  * The btree iterator code serves both functions; it iterates through the keys | ||||
|  * in a btree node in sorted order, starting from either keys after a specific | ||||
|  * point (if you pass it a search key) or the start of the btree node. | ||||
|  * | ||||
|  * AUXILIARY SEARCH TREES: | ||||
|  * | ||||
|  * Since keys are variable length, we can't use a binary search on a bset - we | ||||
|  * wouldn't be able to find the start of the next key. But binary searches are | ||||
|  * slow anyways, due to terrible cache behaviour; bcache originally used binary | ||||
|  * searches and that code topped out at under 50k lookups/second. | ||||
|  * | ||||
|  * So we need to construct some sort of lookup table. Since we only insert keys | ||||
|  * into the last (unwritten) set, most of the keys within a given btree node are | ||||
|  * usually in sets that are mostly constant. We use two different types of | ||||
|  * lookup tables to take advantage of this. | ||||
|  * | ||||
|  * Both lookup tables share in common that they don't index every key in the | ||||
|  * set; they index one key every BSET_CACHELINE bytes, and then a linear search | ||||
|  * is used for the rest. | ||||
|  * | ||||
|  * For sets that have been written to disk and are no longer being inserted | ||||
|  * into, we construct a binary search tree in an array - traversing a binary | ||||
|  * search tree in an array gives excellent locality of reference and is very | ||||
|  * fast, since both children of any node are adjacent to each other in memory | ||||
|  * (and their grandchildren, and great grandchildren...) - this means | ||||
|  * prefetching can be used to great effect. | ||||
|  * | ||||
|  * It's quite useful performance wise to keep these nodes small - not just | ||||
|  * because they're more likely to be in L2, but also because we can prefetch | ||||
|  * more nodes on a single cacheline and thus prefetch more iterations in advance | ||||
|  * when traversing this tree. | ||||
|  * | ||||
|  * Nodes in the auxiliary search tree must contain both a key to compare against | ||||
|  * (we don't want to fetch the key from the set, that would defeat the purpose), | ||||
|  * and a pointer to the key. We use a few tricks to compress both of these. | ||||
|  * | ||||
|  * To compress the pointer, we take advantage of the fact that one node in the | ||||
|  * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have | ||||
|  * a function (to_inorder()) that takes the index of a node in a binary tree and | ||||
|  * returns what its index would be in an inorder traversal, so we only have to | ||||
|  * store the low bits of the offset. | ||||
|  * | ||||
|  * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To | ||||
|  * compress that,  we take advantage of the fact that when we're traversing the | ||||
|  * search tree at every iteration we know that both our search key and the key | ||||
|  * we're looking for lie within some range - bounded by our previous | ||||
|  * comparisons. (We special case the start of a search so that this is true even | ||||
|  * at the root of the tree). | ||||
|  * | ||||
|  * So if we know the key we're looking for is between a and b, and a and b | ||||
|  * don't differ higher than bit 50, we don't need to check anything higher than | ||||
|  * bit 50. | ||||
|  * | ||||
|  * We don't usually need the rest of the bits, either; we only need enough bits | ||||
|  * to partition the key range we're currently checking.  Consider key n - the | ||||
|  * key our auxiliary search tree node corresponds to, and key p, the key | ||||
|  * immediately preceding n.  The lowest bit we need to store in the auxiliary | ||||
|  * search tree is the highest bit that differs between n and p. | ||||
|  * | ||||
|  * Note that this could be bit 0 - we might sometimes need all 84 bits to do the | ||||
|  * comparison. But we'd really like our nodes in the auxiliary search tree to be | ||||
|  * of fixed size. | ||||
|  * | ||||
|  * The solution is to make them fixed size, and when we're constructing a node | ||||
|  * check if p and n differed in the bits we needed them to. If they don't we | ||||
|  * flag that node, and when doing lookups we fallback to comparing against the | ||||
|  * real key. As long as this doesn't happen too often (and it seems to reliably | ||||
|  * happen a bit less than 1% of the time), we win - even on failures, that key | ||||
|  * is then more likely to be in cache than if we were doing binary searches all | ||||
|  * the way, since we're touching so much less memory. | ||||
|  * | ||||
|  * The keys in the auxiliary search tree are stored in (software) floating | ||||
|  * point, with an exponent and a mantissa. The exponent needs to be big enough | ||||
|  * to address all the bits in the original key, but the number of bits in the | ||||
|  * mantissa is somewhat arbitrary; more bits just gets us fewer failures. | ||||
|  * | ||||
|  * We need 7 bits for the exponent and 3 bits for the key's offset (since keys | ||||
|  * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes. | ||||
|  * We need one node per 128 bytes in the btree node, which means the auxiliary | ||||
|  * search trees take up 3% as much memory as the btree itself. | ||||
|  * | ||||
|  * Constructing these auxiliary search trees is moderately expensive, and we | ||||
|  * don't want to be constantly rebuilding the search tree for the last set | ||||
|  * whenever we insert another key into it. For the unwritten set, we use a much | ||||
|  * simpler lookup table - it's just a flat array, so index i in the lookup table | ||||
|  * corresponds to the i'th range of BSET_CACHELINE bytes in the set. Indexing | ||||
|  * within each byte range works the same as with the auxiliary search trees. | ||||
|  * | ||||
|  * These are much easier to keep up to date when we insert a key - we do it | ||||
|  * somewhat lazily; when we shift a key up we usually just increment the pointer | ||||
|  * to it, only when it would overflow do we go to the trouble of finding the | ||||
|  * first key in that range of bytes again. | ||||
|  */ | ||||
| 
 | ||||
| /* Btree key comparison/iteration */ | ||||
| 
 | ||||
| struct btree_iter { | ||||
| 	size_t size, used; | ||||
| 	struct btree_iter_set { | ||||
| 		struct bkey *k, *end; | ||||
| 	} data[MAX_BSETS]; | ||||
| }; | ||||
| 
 | ||||
| struct bset_tree { | ||||
| 	/*
 | ||||
| 	 * We construct a binary tree in an array as if the array | ||||
| 	 * started at 1, so that things line up on the same cachelines | ||||
| 	 * better: see comments in bset.c at cacheline_to_bkey() for | ||||
| 	 * details | ||||
| 	 */ | ||||
| 
 | ||||
| 	/* size of the binary tree and prev array */ | ||||
| 	unsigned	size; | ||||
| 
 | ||||
| 	/* function of size - precalculated for to_inorder() */ | ||||
| 	unsigned	extra; | ||||
| 
 | ||||
| 	/* copy of the last key in the set */ | ||||
| 	struct bkey	end; | ||||
| 	struct bkey_float *tree; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * The nodes in the bset tree point to specific keys - this | ||||
| 	 * array holds the sizes of the previous key. | ||||
| 	 * | ||||
| 	 * Conceptually it's a member of struct bkey_float, but we want | ||||
| 	 * to keep bkey_float to 4 bytes and prev isn't used in the fast | ||||
| 	 * path. | ||||
| 	 */ | ||||
| 	uint8_t		*prev; | ||||
| 
 | ||||
| 	/* The actual btree node, with pointers to each sorted set */ | ||||
| 	struct bset	*data; | ||||
| }; | ||||
| 
 | ||||
/*
 * Three-way comparison of two keys: ordered by inode first, then by offset.
 * Returns the (signed) difference of the first field that differs, so the
 * result is negative, zero or positive as @l sorts before, equal to or
 * after @r.
 */
static __always_inline int64_t bkey_cmp(const struct bkey *l,
					const struct bkey *r)
{
	if (unlikely(KEY_INODE(l) != KEY_INODE(r)))
		return (int64_t) KEY_INODE(l) - (int64_t) KEY_INODE(r);

	return (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r);
}
| 
 | ||||
/*
 * Size of key @k in 64-bit words: two words, plus one per pointer, plus one
 * more if the key has KEY_CSUM set.  KEY_CSUM values above 1 are a BUG.
 */
static inline size_t bkey_u64s(const struct bkey *k)
{
	size_t u64s = 2 + KEY_PTRS(k);

	BUG_ON(KEY_CSUM(k) > 1);

	if (KEY_CSUM(k))
		u64s++;

	return u64s;
}
| 
 | ||||
/* Size of key @k in bytes, i.e. bkey_u64s() scaled by sizeof(uint64_t). */
static inline size_t bkey_bytes(const struct bkey *k)
{
	size_t u64s = bkey_u64s(k);

	return u64s * sizeof(uint64_t);
}
| 
 | ||||
/*
 * Copy the whole of key @src (bkey_bytes(src) bytes) to @dest.  Uses
 * memcpy(), so the two keys must not overlap.
 */
static inline void bkey_copy(struct bkey *dest, const struct bkey *src)
{
	size_t bytes = bkey_bytes(src);

	memcpy(dest, src, bytes);
}
| 
 | ||||
| static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src) | ||||
| { | ||||
| 	if (!src) | ||||
| 		src = &KEY(0, 0, 0); | ||||
| 
 | ||||
| 	SET_KEY_INODE(dest, KEY_INODE(src)); | ||||
| 	SET_KEY_OFFSET(dest, KEY_OFFSET(src)); | ||||
| } | ||||
| 
 | ||||
/*
 * Return the address directly after key @k, i.e. @k advanced by
 * bkey_u64s(k) 64-bit words - where the next key starts when keys are
 * packed back to back.
 */
static inline struct bkey *bkey_next(const struct bkey *k)
{
	return (struct bkey *) ((uint64_t *) k + bkey_u64s(k));
}
| 
 | ||||
| /* Keylists */ | ||||
| 
 | ||||
/* Enough room for btree_split's keys without realloc */
#define KEYLIST_INLINE		16

/*
 * A list of keys packed back to back in a buffer of 64-bit words.
 *
 * list/bottom are two views of the start of the buffer, top points just past
 * the last key.  The buffer is initially the inline array d[]; see
 * bch_keylist_free() for the case where it has been replaced by a separately
 * allocated one.
 */
struct keylist {
	struct bkey		*top;
	union {
		uint64_t		*list;
		struct bkey		*bottom;
	};

	uint64_t		d[KEYLIST_INLINE];
};
| 
 | ||||
| static inline void bch_keylist_init(struct keylist *l) | ||||
| { | ||||
| 	l->top = (void *) (l->list = l->d); | ||||
| } | ||||
| 
 | ||||
| static inline void bch_keylist_push(struct keylist *l) | ||||
| { | ||||
| 	l->top = bkey_next(l->top); | ||||
| } | ||||
| 
 | ||||
| static inline void bch_keylist_add(struct keylist *l, struct bkey *k) | ||||
| { | ||||
| 	bkey_copy(l->top, k); | ||||
| 	bch_keylist_push(l); | ||||
| } | ||||
| 
 | ||||
| static inline bool bch_keylist_empty(struct keylist *l) | ||||
| { | ||||
| 	return l->top == (void *) l->list; | ||||
| } | ||||
| 
 | ||||
| static inline void bch_keylist_free(struct keylist *l) | ||||
| { | ||||
| 	if (l->list != l->d) | ||||
| 		kfree(l->list); | ||||
| } | ||||
| 
 | ||||
| void bch_keylist_copy(struct keylist *, struct keylist *); | ||||
| struct bkey *bch_keylist_pop(struct keylist *); | ||||
| int bch_keylist_realloc(struct keylist *, int, struct cache_set *); | ||||
| 
 | ||||
| void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *, | ||||
| 			      unsigned); | ||||
| bool __bch_cut_front(const struct bkey *, struct bkey *); | ||||
| bool __bch_cut_back(const struct bkey *, struct bkey *); | ||||
| 
 | ||||
| static inline bool bch_cut_front(const struct bkey *where, struct bkey *k) | ||||
| { | ||||
| 	BUG_ON(bkey_cmp(where, k) > 0); | ||||
| 	return __bch_cut_front(where, k); | ||||
| } | ||||
| 
 | ||||
| static inline bool bch_cut_back(const struct bkey *where, struct bkey *k) | ||||
| { | ||||
| 	BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0); | ||||
| 	return __bch_cut_back(where, k); | ||||
| } | ||||
| 
 | ||||
| const char *bch_ptr_status(struct cache_set *, const struct bkey *); | ||||
| bool __bch_ptr_invalid(struct cache_set *, int level, const struct bkey *); | ||||
| bool bch_ptr_bad(struct btree *, const struct bkey *); | ||||
| 
 | ||||
/*
 * How far generation @a is ahead of generation @b, in wrapping 8-bit
 * arithmetic.  Differences larger than 128 are taken to mean that @a is
 * actually behind @b and are reported as 0.
 */
static inline uint8_t gen_after(uint8_t a, uint8_t b)
{
	uint8_t delta = a - b;

	if (delta > 128U)
		return 0;

	return delta;
}
| 
 | ||||
| static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k, | ||||
| 				unsigned i) | ||||
| { | ||||
| 	return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i)); | ||||
| } | ||||
| 
 | ||||
| static inline bool ptr_available(struct cache_set *c, const struct bkey *k, | ||||
| 				 unsigned i) | ||||
| { | ||||
| 	return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i); | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *); | ||||
| 
 | ||||
| struct bkey *bch_next_recurse_key(struct btree *, struct bkey *); | ||||
| struct bkey *bch_btree_iter_next(struct btree_iter *); | ||||
| struct bkey *bch_btree_iter_next_filter(struct btree_iter *, | ||||
| 					struct btree *, ptr_filter_fn); | ||||
| 
 | ||||
| void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *); | ||||
| struct bkey *__bch_btree_iter_init(struct btree *, struct btree_iter *, | ||||
| 				   struct bkey *, struct bset_tree *); | ||||
| 
 | ||||
| /* 32 bits total: */ | ||||
| #define BKEY_MID_BITS		3 | ||||
| #define BKEY_EXPONENT_BITS	7 | ||||
| #define BKEY_MANTISSA_BITS	22 | ||||
| #define BKEY_MANTISSA_MASK	((1 << BKEY_MANTISSA_BITS) - 1) | ||||
| 
 | ||||
/*
 * One entry of the auxiliary search tree. The three bitfield widths come from
 * the BKEY_*_BITS constants defined just above (7 + 3 + 22 = 32 bits), and the
 * struct is __packed.
 */
| struct bkey_float { | ||||
| 	unsigned	exponent:BKEY_EXPONENT_BITS; | ||||
| 	unsigned	m:BKEY_MID_BITS; | ||||
| 	unsigned	mantissa:BKEY_MANTISSA_BITS; | ||||
| } __packed; | ||||
| 
 | ||||
| /*
 | ||||
|  * BSET_CACHELINE was originally intended to match the hardware cacheline size - | ||||
|  * it used to be 64, but I realized the lookup code would touch slightly less | ||||
|  * memory if it was 128. | ||||
|  * | ||||
|  * It defines the number of bytes (in struct bset) per struct bkey_float in | ||||
|  * the auxiliary search tree - when we're done searching the bset_float tree we | ||||
|  * have this many bytes left that we do a linear search over. | ||||
|  * | ||||
|  * Since (after level 5) every level of the bset_tree is on a new cacheline, | ||||
|  * we're touching one fewer cacheline in the bset tree in exchange for one more | ||||
|  * cacheline in the linear search - but the linear search might stop before it | ||||
|  * gets to the second cacheline. | ||||
|  */ | ||||
| 
 | ||||
| #define BSET_CACHELINE		128 | ||||
| #define bset_tree_space(b)	(btree_data_space(b) / BSET_CACHELINE) | ||||
| 
 | ||||
| #define bset_tree_bytes(b)	(bset_tree_space(b) * sizeof(struct bkey_float)) | ||||
| #define bset_prev_bytes(b)	(bset_tree_space(b) * sizeof(uint8_t)) | ||||
| 
 | ||||
| void bch_bset_init_next(struct btree *); | ||||
| 
 | ||||
| void bch_bset_fix_invalidated_key(struct btree *, struct bkey *); | ||||
| void bch_bset_fix_lookup_table(struct btree *, struct bkey *); | ||||
| 
 | ||||
| struct bkey *__bch_bset_search(struct btree *, struct bset_tree *, | ||||
| 			   const struct bkey *); | ||||
| 
 | ||||
| static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t, | ||||
| 					   const struct bkey *search) | ||||
| { | ||||
| 	return search ? __bch_bset_search(b, t, search) : t->data->start; | ||||
| } | ||||
| 
 | ||||
| bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *); | ||||
| void bch_btree_sort_lazy(struct btree *); | ||||
| void bch_btree_sort_into(struct btree *, struct btree *); | ||||
| void bch_btree_sort_and_fix_extents(struct btree *, struct btree_iter *); | ||||
| void bch_btree_sort_partial(struct btree *, unsigned); | ||||
| 
 | ||||
| static inline void bch_btree_sort(struct btree *b) | ||||
| { | ||||
| 	bch_btree_sort_partial(b, 0); | ||||
| } | ||||
| 
 | ||||
| int bch_bset_print_stats(struct cache_set *, char *); | ||||
| 
 | ||||
| #endif | ||||
							
								
								
									
										2503
									
								
								drivers/md/bcache/btree.c
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										2503
									
								
								drivers/md/bcache/btree.c
									
										
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							
							
								
								
									
										405
									
								
								drivers/md/bcache/btree.h
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										405
									
								
								drivers/md/bcache/btree.h
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,405 @@ | |||
| #ifndef _BCACHE_BTREE_H | ||||
| #define _BCACHE_BTREE_H | ||||
| 
 | ||||
| /*
 | ||||
|  * THE BTREE: | ||||
|  * | ||||
|  * At a high level, bcache's btree is a relatively standard b+ tree. All keys and | ||||
|  * pointers are in the leaves; interior nodes only have pointers to the child | ||||
|  * nodes. | ||||
|  * | ||||
|  * In the interior nodes, a struct bkey always points to a child btree node, and | ||||
|  * the key is the highest key in the child node - except that the highest key in | ||||
|  * an interior node is always MAX_KEY. The size field refers to the size on disk | ||||
|  * of the child node - this would allow us to have variable sized btree nodes | ||||
|  * (handy for keeping the depth of the btree 1 by expanding just the root). | ||||
|  * | ||||
|  * Btree nodes are themselves log structured, but this is hidden fairly | ||||
|  * thoroughly. Btree nodes on disk will in practice have extents that overlap | ||||
|  * (because they were written at different times), but in memory we never have | ||||
|  * overlapping extents - when we read in a btree node from disk, the first thing | ||||
|  * we do is resort all the sets of keys with a mergesort, and in the same pass | ||||
|  * we check for overlapping extents and adjust them appropriately. | ||||
|  * | ||||
|  * struct btree_op is a central interface to the btree code. It's used for | ||||
|  * specifying read vs. write locking, and the embedded closure is used for | ||||
|  * waiting on IO or reserve memory. | ||||
|  * | ||||
|  * BTREE CACHE: | ||||
|  * | ||||
|  * Btree nodes are cached in memory; traversing the btree might require reading | ||||
|  * in btree nodes which is handled mostly transparently. | ||||
|  * | ||||
|  * bch_btree_node_get() looks up a btree node in the cache and reads it in from | ||||
|  * disk if necessary. This function is almost never called directly though - the | ||||
|  * btree() macro is used to get a btree node, call some function on it, and | ||||
|  * unlock the node after the function returns. | ||||
|  * | ||||
|  * The root is special cased - it's taken out of the cache's lru (thus pinning | ||||
|  * it in memory), so we can find the root of the btree by just dereferencing a | ||||
|  * pointer instead of looking it up in the cache. This makes locking a bit | ||||
|  * tricky, since the root pointer is protected by the lock in the btree node it | ||||
|  * points to - the btree_root() macro handles this. | ||||
|  * | ||||
|  * In various places we must be able to allocate memory for multiple btree nodes | ||||
|  * in order to make forward progress. To do this we use the btree cache itself | ||||
|  * as a reserve; if __get_free_pages() fails, we'll find a node in the btree | ||||
|  * cache we can reuse. We can't allow more than one thread to be doing this at a | ||||
|  * time, so there's a lock, implemented by a pointer to the btree_op closure - | ||||
|  * this allows the btree_root() macro to implicitly release this lock. | ||||
|  * | ||||
|  * BTREE IO: | ||||
|  * | ||||
|  * Btree nodes never have to be explicitly read in; bch_btree_node_get() handles | ||||
|  * this. | ||||
|  * | ||||
|  * For writing, we have two btree_write structs embedded in struct btree - one | ||||
|  * write in flight, and one being set up, and we toggle between them. | ||||
|  * | ||||
|  * Writing is done with a single function -  bch_btree_write() really serves two | ||||
|  * different purposes and should be broken up into two different functions. When | ||||
|  * passing now = false, it merely indicates that the node is now dirty - calling | ||||
|  * it ensures that the dirty keys will be written at some point in the future. | ||||
|  * | ||||
|  * When passing now = true, bch_btree_write() causes a write to happen | ||||
|  * "immediately" (if there was already a write in flight, it'll cause the write | ||||
|  * to happen as soon as the previous write completes). It returns immediately | ||||
|  * though - but it takes a refcount on the closure in struct btree_op you passed | ||||
|  * to it, so a closure_sync() later can be used to wait for the write to | ||||
|  * complete. | ||||
|  * | ||||
|  * This is handy because btree_split() and garbage collection can issue writes | ||||
|  * in parallel, reducing the amount of time they have to hold write locks. | ||||
|  * | ||||
|  * LOCKING: | ||||
|  * | ||||
|  * When traversing the btree, we may need write locks starting at some level - | ||||
|  * inserting a key into the btree will typically only require a write lock on | ||||
|  * the leaf node. | ||||
|  * | ||||
|  * This is specified with the lock field in struct btree_op; lock = 0 means we | ||||
|  * take write locks at level <= 0, i.e. only leaf nodes. bch_btree_node_get() | ||||
|  * checks this field and returns the node with the appropriate lock held. | ||||
|  * | ||||
|  * If, after traversing the btree, the insertion code discovers it has to split | ||||
|  * then it must restart from the root and take new locks - to do this it changes | ||||
|  * the lock field and returns -EINTR, which causes the btree_root() macro to | ||||
|  * loop. | ||||
|  * | ||||
|  * Handling cache misses requires a different mechanism for upgrading to a write | ||||
|  * lock. We do cache lookups with only a read lock held, but if we get a cache | ||||
|  * miss and we wish to insert this data into the cache, we have to insert a | ||||
|  * placeholder key to detect races - otherwise, we could race with a write and | ||||
|  * overwrite the data that was just written to the cache with stale data from | ||||
|  * the backing device. | ||||
|  * | ||||
|  * For this we use a sequence number that write locks and unlocks increment - to | ||||
|  * insert the check key it unlocks the btree node and then takes a write lock, | ||||
|  * and fails if the sequence number doesn't match. | ||||
|  */ | ||||
| 
 | ||||
| #include "bset.h" | ||||
| #include "debug.h" | ||||
| 
 | ||||
| struct btree_write { | ||||
| 	struct closure		*owner; | ||||
| 	atomic_t		*journal; | ||||
| 
 | ||||
| 	/* If btree_split() frees a btree node, it writes a new pointer to that
 | ||||
| 	 * btree node indicating it was freed; it takes a refcount on | ||||
| 	 * c->prio_blocked because we can't write the gens until the new | ||||
| 	 * pointer is on disk. This allows btree_write_endio() to release the | ||||
| 	 * refcount that btree_split() took. | ||||
| 	 */ | ||||
| 	int			prio_blocked; | ||||
| }; | ||||
| 
 | ||||
| struct btree { | ||||
| 	/* Hottest entries first */ | ||||
| 	struct hlist_node	hash; | ||||
| 
 | ||||
| 	/* Key/pointer for this btree node */ | ||||
| 	BKEY_PADDED(key); | ||||
| 
 | ||||
| 	/* Single bit - set when accessed, cleared by shrinker */ | ||||
| 	unsigned long		accessed; | ||||
| 	unsigned long		seq; | ||||
| 	struct rw_semaphore	lock; | ||||
| 	struct cache_set	*c; | ||||
| 
 | ||||
| 	unsigned long		flags; | ||||
| 	uint16_t		written;	/* would be nice to kill */ | ||||
| 	uint8_t			level; | ||||
| 	uint8_t			nsets; | ||||
| 	uint8_t			page_order; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Set of sorted keys - the real btree node - plus a binary search tree | ||||
| 	 * | ||||
| 	 * sets[0] is special; set[0]->tree, set[0]->prev and set[0]->data point | ||||
| 	 * to the memory we have allocated for this btree node. Additionally, | ||||
| 	 * set[0]->data points to the entire btree node as it exists on disk. | ||||
| 	 */ | ||||
| 	struct bset_tree	sets[MAX_BSETS]; | ||||
| 
 | ||||
| 	/* Used to refcount bio splits, also protects b->bio */ | ||||
| 	struct closure_with_waitlist	io; | ||||
| 
 | ||||
| 	/* Gets transferred to w->prio_blocked - see the comment there */ | ||||
| 	int			prio_blocked; | ||||
| 
 | ||||
| 	struct list_head	list; | ||||
| 	struct delayed_work	work; | ||||
| 
 | ||||
| 	uint64_t		io_start_time; | ||||
| 	struct btree_write	writes[2]; | ||||
| 	struct bio		*bio; | ||||
| }; | ||||
| 
 | ||||
| #define BTREE_FLAG(flag)						\ | ||||
| static inline bool btree_node_ ## flag(struct btree *b)			\ | ||||
| {	return test_bit(BTREE_NODE_ ## flag, &b->flags); }		\ | ||||
| 									\ | ||||
| static inline void set_btree_node_ ## flag(struct btree *b)		\ | ||||
| {	set_bit(BTREE_NODE_ ## flag, &b->flags); }			\ | ||||
| 
 | ||||
/*
 * Bit numbers within the flags word of struct btree. Each enumerator is paired
 * with a BTREE_FLAG() expansion, which generates btree_node_<flag>() (test_bit)
 * and set_btree_node_<flag>() (set_bit) accessors operating on b->flags.
 * write_idx is the bit btree_current_write()/btree_prev_write() use to index
 * b->writes[].
 */
| enum btree_flags { | ||||
| 	BTREE_NODE_read_done, | ||||
| 	BTREE_NODE_io_error, | ||||
| 	BTREE_NODE_dirty, | ||||
| 	BTREE_NODE_write_idx, | ||||
| }; | ||||
| 
 | ||||
| BTREE_FLAG(read_done); | ||||
| BTREE_FLAG(io_error); | ||||
| BTREE_FLAG(dirty); | ||||
| BTREE_FLAG(write_idx); | ||||
| 
 | ||||
| static inline struct btree_write *btree_current_write(struct btree *b) | ||||
| { | ||||
| 	return b->writes + btree_node_write_idx(b); | ||||
| } | ||||
| 
 | ||||
| static inline struct btree_write *btree_prev_write(struct btree *b) | ||||
| { | ||||
| 	return b->writes + (btree_node_write_idx(b) ^ 1); | ||||
| } | ||||
| 
 | ||||
| static inline unsigned bset_offset(struct btree *b, struct bset *i) | ||||
| { | ||||
| 	return (((size_t) i) - ((size_t) b->sets->data)) >> 9; | ||||
| } | ||||
| 
 | ||||
| static inline struct bset *write_block(struct btree *b) | ||||
| { | ||||
| 	return ((void *) b->sets[0].data) + b->written * block_bytes(b->c); | ||||
| } | ||||
| 
 | ||||
| static inline bool bset_written(struct btree *b, struct bset_tree *t) | ||||
| { | ||||
| 	return t->data < write_block(b); | ||||
| } | ||||
| 
 | ||||
| static inline bool bkey_written(struct btree *b, struct bkey *k) | ||||
| { | ||||
| 	return k < write_block(b)->start; | ||||
| } | ||||
| 
 | ||||
| static inline void set_gc_sectors(struct cache_set *c) | ||||
| { | ||||
| 	atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 8); | ||||
| } | ||||
| 
 | ||||
| static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k) | ||||
| { | ||||
| 	return __bch_ptr_invalid(b->c, b->level, k); | ||||
| } | ||||
| 
 | ||||
| static inline struct bkey *bch_btree_iter_init(struct btree *b, | ||||
| 					       struct btree_iter *iter, | ||||
| 					       struct bkey *search) | ||||
| { | ||||
| 	return __bch_btree_iter_init(b, iter, search, b->sets); | ||||
| } | ||||
| 
 | ||||
| /* Looping macros */ | ||||
| 
 | ||||
| #define for_each_cached_btree(b, c, iter)				\ | ||||
| 	for (iter = 0;							\ | ||||
| 	     iter < ARRAY_SIZE((c)->bucket_hash);			\ | ||||
| 	     iter++)							\ | ||||
| 		hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash) | ||||
| 
 | ||||
| #define for_each_key_filter(b, k, iter, filter)				\ | ||||
| 	for (bch_btree_iter_init((b), (iter), NULL);			\ | ||||
| 	     ((k) = bch_btree_iter_next_filter((iter), b, filter));) | ||||
| 
 | ||||
| #define for_each_key(b, k, iter)					\ | ||||
| 	for (bch_btree_iter_init((b), (iter), NULL);			\ | ||||
| 	     ((k) = bch_btree_iter_next(iter));) | ||||
| 
 | ||||
| /* Recursing down the btree */ | ||||
| 
 | ||||
| struct btree_op { | ||||
| 	struct closure		cl; | ||||
| 	struct cache_set	*c; | ||||
| 
 | ||||
| 	/* Journal entry we have a refcount on */ | ||||
| 	atomic_t		*journal; | ||||
| 
 | ||||
| 	/* Bio to be inserted into the cache */ | ||||
| 	struct bio		*cache_bio; | ||||
| 
 | ||||
| 	unsigned		inode; | ||||
| 
 | ||||
| 	uint16_t		write_prio; | ||||
| 
 | ||||
| 	/* Btree level at which we start taking write locks */ | ||||
| 	short			lock; | ||||
| 
 | ||||
| 	/* Btree insertion type */ | ||||
| 	enum { | ||||
| 		BTREE_INSERT, | ||||
| 		BTREE_REPLACE | ||||
| 	} type:8; | ||||
| 
 | ||||
| 	unsigned		csum:1; | ||||
| 	unsigned		skip:1; | ||||
| 	unsigned		flush_journal:1; | ||||
| 
 | ||||
| 	unsigned		insert_data_done:1; | ||||
| 	unsigned		lookup_done:1; | ||||
| 	unsigned		insert_collision:1; | ||||
| 
 | ||||
| 	/* Anything after this point won't get zeroed in do_bio_hook() */ | ||||
| 
 | ||||
| 	/* Keys to be inserted */ | ||||
| 	struct keylist		keys; | ||||
| 	BKEY_PADDED(replace); | ||||
| }; | ||||
| 
 | ||||
| void bch_btree_op_init_stack(struct btree_op *); | ||||
| 
 | ||||
| static inline void rw_lock(bool w, struct btree *b, int level) | ||||
| { | ||||
| 	w ? down_write_nested(&b->lock, level + 1) | ||||
| 	  : down_read_nested(&b->lock, level + 1); | ||||
| 	if (w) | ||||
| 		b->seq++; | ||||
| } | ||||
| 
 | ||||
| static inline void rw_unlock(bool w, struct btree *b) | ||||
| { | ||||
| #ifdef CONFIG_BCACHE_EDEBUG | ||||
| 	unsigned i; | ||||
| 
 | ||||
| 	if (w && | ||||
| 	    b->key.ptr[0] && | ||||
| 	    btree_node_read_done(b)) | ||||
| 		for (i = 0; i <= b->nsets; i++) | ||||
| 			bch_check_key_order(b, b->sets[i].data); | ||||
| #endif | ||||
| 
 | ||||
| 	if (w) | ||||
| 		b->seq++; | ||||
| 	(w ? up_write : up_read)(&b->lock); | ||||
| } | ||||
| 
 | ||||
| #define insert_lock(s, b)	((b)->level <= (s)->lock) | ||||
| 
 | ||||
| /*
 | ||||
|  * These macros are for recursing down the btree - they handle the details of | ||||
|  * locking and looking up nodes in the cache for you. They're best treated as | ||||
|  * mere syntax when reading code that uses them. | ||||
|  * | ||||
|  * op->lock determines whether we take a read or a write lock at a given depth. | ||||
|  * If you've got a read lock and find that you need a write lock (i.e. you're | ||||
|  * going to have to split), set op->lock and return -EINTR; btree_root() will | ||||
|  * call you again and you'll have the correct lock. | ||||
|  */ | ||||
| 
 | ||||
| /**
 | ||||
|  * btree - recurse down the btree on a specified key | ||||
|  * @fn:		function to call, which will be passed the child node | ||||
|  * @key:	key to recurse on | ||||
|  * @b:		parent btree node | ||||
|  * @op:		pointer to struct btree_op | ||||
|  */ | ||||
| #define btree(fn, key, b, op, ...)					\ | ||||
| ({									\ | ||||
| 	int _r, l = (b)->level - 1;					\ | ||||
| 	bool _w = l <= (op)->lock;					\ | ||||
| 	struct btree *_b = bch_btree_node_get((b)->c, key, l, op);	\ | ||||
| 	if (!IS_ERR(_b)) {						\ | ||||
| 		_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);		\ | ||||
| 		rw_unlock(_w, _b);					\ | ||||
| 	} else								\ | ||||
| 		_r = PTR_ERR(_b);					\ | ||||
| 	_r;								\ | ||||
| }) | ||||
| 
 | ||||
| /**
 | ||||
|  * btree_root - call a function on the root of the btree | ||||
|  * @fn:		function to call, which will be passed the child node | ||||
|  * @c:		cache set | ||||
|  * @op:		pointer to struct btree_op | ||||
|  */ | ||||
| #define btree_root(fn, c, op, ...)					\ | ||||
| ({									\ | ||||
| 	int _r = -EINTR;						\ | ||||
| 	do {								\ | ||||
| 		struct btree *_b = (c)->root;				\ | ||||
| 		bool _w = insert_lock(op, _b);				\ | ||||
| 		rw_lock(_w, _b, _b->level);				\ | ||||
| 		if (_b == (c)->root &&					\ | ||||
| 		    _w == insert_lock(op, _b))				\ | ||||
| 			_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);	\ | ||||
| 		rw_unlock(_w, _b);					\ | ||||
| 		bch_cannibalize_unlock(c, &(op)->cl);		\ | ||||
| 	} while (_r == -EINTR);						\ | ||||
| 									\ | ||||
| 	_r;								\ | ||||
| }) | ||||
| 
 | ||||
| static inline bool should_split(struct btree *b) | ||||
| { | ||||
| 	struct bset *i = write_block(b); | ||||
| 	return b->written >= btree_blocks(b) || | ||||
| 		(i->seq == b->sets[0].data->seq && | ||||
| 		 b->written + __set_blocks(i, i->keys + 15, b->c) | ||||
| 		 > btree_blocks(b)); | ||||
| } | ||||
| 
 | ||||
| void bch_btree_read_done(struct closure *); | ||||
| void bch_btree_read(struct btree *); | ||||
| void bch_btree_write(struct btree *b, bool now, struct btree_op *op); | ||||
| 
 | ||||
| void bch_cannibalize_unlock(struct cache_set *, struct closure *); | ||||
| void bch_btree_set_root(struct btree *); | ||||
| struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *); | ||||
| struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, | ||||
| 				int, struct btree_op *); | ||||
| 
 | ||||
| bool bch_btree_insert_keys(struct btree *, struct btree_op *); | ||||
| bool bch_btree_insert_check_key(struct btree *, struct btree_op *, | ||||
| 				   struct bio *); | ||||
| int bch_btree_insert(struct btree_op *, struct cache_set *); | ||||
| 
 | ||||
| int bch_btree_search_recurse(struct btree *, struct btree_op *); | ||||
| 
 | ||||
| void bch_queue_gc(struct cache_set *); | ||||
| size_t bch_btree_gc_finish(struct cache_set *); | ||||
| void bch_moving_gc(struct closure *); | ||||
| int bch_btree_check(struct cache_set *, struct btree_op *); | ||||
| uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *); | ||||
| 
 | ||||
| void bch_keybuf_init(struct keybuf *, keybuf_pred_fn *); | ||||
| void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *); | ||||
| bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *, | ||||
| 				  struct bkey *); | ||||
| void bch_keybuf_del(struct keybuf *, struct keybuf_key *); | ||||
| struct keybuf_key *bch_keybuf_next(struct keybuf *); | ||||
| struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, | ||||
| 					  struct keybuf *, struct bkey *); | ||||
| 
 | ||||
| #endif | ||||
							
								
								
									
										348
									
								
								drivers/md/bcache/closure.c
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										348
									
								
								drivers/md/bcache/closure.c
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,348 @@ | |||
| /*
 | ||||
|  * Asynchronous refcounty things | ||||
|  * | ||||
|  * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> | ||||
|  * Copyright 2012 Google, Inc. | ||||
|  */ | ||||
| 
 | ||||
| #include <linux/debugfs.h> | ||||
| #include <linux/module.h> | ||||
| #include <linux/seq_file.h> | ||||
| 
 | ||||
| #include "closure.h" | ||||
| 
 | ||||
| void closure_queue(struct closure *cl) | ||||
| { | ||||
| 	struct workqueue_struct *wq = cl->wq; | ||||
| 	if (wq) { | ||||
| 		INIT_WORK(&cl->work, cl->work.func); | ||||
| 		BUG_ON(!queue_work(wq, &cl->work)); | ||||
| 	} else | ||||
| 		cl->fn(cl); | ||||
| } | ||||
| EXPORT_SYMBOL_GPL(closure_queue); | ||||
| 
 | ||||
| #define CL_FIELD(type, field)					\ | ||||
| 	case TYPE_ ## type:					\ | ||||
| 	return &container_of(cl, struct type, cl)->field | ||||
| 
 | ||||
| static struct closure_waitlist *closure_waitlist(struct closure *cl) | ||||
| { | ||||
| 	switch (cl->type) { | ||||
| 		CL_FIELD(closure_with_waitlist, wait); | ||||
| 		CL_FIELD(closure_with_waitlist_and_timer, wait); | ||||
| 	default: | ||||
| 		return NULL; | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| static struct timer_list *closure_timer(struct closure *cl) | ||||
| { | ||||
| 	switch (cl->type) { | ||||
| 		CL_FIELD(closure_with_timer, timer); | ||||
| 		CL_FIELD(closure_with_waitlist_and_timer, timer); | ||||
| 	default: | ||||
| 		return NULL; | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| static inline void closure_put_after_sub(struct closure *cl, int flags) | ||||
| { | ||||
| 	int r = flags & CLOSURE_REMAINING_MASK; | ||||
| 
 | ||||
| 	BUG_ON(flags & CLOSURE_GUARD_MASK); | ||||
| 	BUG_ON(!r && (flags & ~(CLOSURE_DESTRUCTOR|CLOSURE_BLOCKING))); | ||||
| 
 | ||||
| 	/* Must deliver precisely one wakeup */ | ||||
| 	if (r == 1 && (flags & CLOSURE_SLEEPING)) | ||||
| 		wake_up_process(cl->task); | ||||
| 
 | ||||
| 	if (!r) { | ||||
| 		if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { | ||||
| 			/* CLOSURE_BLOCKING might be set - clear it */ | ||||
| 			atomic_set(&cl->remaining, | ||||
| 				   CLOSURE_REMAINING_INITIALIZER); | ||||
| 			closure_queue(cl); | ||||
| 		} else { | ||||
| 			struct closure *parent = cl->parent; | ||||
| 			struct closure_waitlist *wait = closure_waitlist(cl); | ||||
| 
 | ||||
| 			closure_debug_destroy(cl); | ||||
| 
 | ||||
| 			atomic_set(&cl->remaining, -1); | ||||
| 
 | ||||
| 			if (wait) | ||||
| 				closure_wake_up(wait); | ||||
| 
 | ||||
| 			if (cl->fn) | ||||
| 				cl->fn(cl); | ||||
| 
 | ||||
| 			if (parent) | ||||
| 				closure_put(parent); | ||||
| 		} | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| /* For clearing flags with the same atomic op as a put */ | ||||
| void closure_sub(struct closure *cl, int v) | ||||
| { | ||||
| 	closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); | ||||
| } | ||||
| EXPORT_SYMBOL_GPL(closure_sub); | ||||
| 
 | ||||
| void closure_put(struct closure *cl) | ||||
| { | ||||
| 	closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); | ||||
| } | ||||
| EXPORT_SYMBOL_GPL(closure_put); | ||||
| 
 | ||||
| static void set_waiting(struct closure *cl, unsigned long f) | ||||
| { | ||||
| #ifdef CONFIG_BCACHE_CLOSURES_DEBUG | ||||
| 	cl->waiting_on = f; | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| void __closure_wake_up(struct closure_waitlist *wait_list) | ||||
| { | ||||
| 	struct llist_node *list; | ||||
| 	struct closure *cl; | ||||
| 	struct llist_node *reverse = NULL; | ||||
| 
 | ||||
| 	list = llist_del_all(&wait_list->list); | ||||
| 
 | ||||
| 	/* We first reverse the list to preserve FIFO ordering and fairness */ | ||||
| 
 | ||||
| 	while (list) { | ||||
| 		struct llist_node *t = list; | ||||
| 		list = llist_next(list); | ||||
| 
 | ||||
| 		t->next = reverse; | ||||
| 		reverse = t; | ||||
| 	} | ||||
| 
 | ||||
| 	/* Then do the wakeups */ | ||||
| 
 | ||||
| 	while (reverse) { | ||||
| 		cl = container_of(reverse, struct closure, list); | ||||
| 		reverse = llist_next(reverse); | ||||
| 
 | ||||
| 		set_waiting(cl, 0); | ||||
| 		closure_sub(cl, CLOSURE_WAITING + 1); | ||||
| 	} | ||||
| } | ||||
| EXPORT_SYMBOL_GPL(__closure_wake_up); | ||||
| 
 | ||||
| bool closure_wait(struct closure_waitlist *list, struct closure *cl) | ||||
| { | ||||
| 	if (atomic_read(&cl->remaining) & CLOSURE_WAITING) | ||||
| 		return false; | ||||
| 
 | ||||
| 	set_waiting(cl, _RET_IP_); | ||||
| 	atomic_add(CLOSURE_WAITING + 1, &cl->remaining); | ||||
| 	llist_add(&cl->list, &list->list); | ||||
| 
 | ||||
| 	return true; | ||||
| } | ||||
| EXPORT_SYMBOL_GPL(closure_wait); | ||||
| 
 | ||||
| /**
 | ||||
|  * closure_sync() - sleep until a closure has nothing left to wait on | ||||
|  * | ||||
|  * Sleeps until the refcount hits 1 - the thread that's running the closure owns | ||||
|  * the last refcount. | ||||
|  */ | ||||
| void closure_sync(struct closure *cl) | ||||
| { | ||||
| 	while (1) { | ||||
| 		__closure_start_sleep(cl); | ||||
| 		closure_set_ret_ip(cl); | ||||
| 
 | ||||
| 		if ((atomic_read(&cl->remaining) & | ||||
| 		     CLOSURE_REMAINING_MASK) == 1) | ||||
| 			break; | ||||
| 
 | ||||
| 		schedule(); | ||||
| 	} | ||||
| 
 | ||||
| 	__closure_end_sleep(cl); | ||||
| } | ||||
| EXPORT_SYMBOL_GPL(closure_sync); | ||||
| 
 | ||||
| /**
 | ||||
|  * closure_trylock() - try to acquire the closure, without waiting | ||||
|  * @cl:		closure to lock | ||||
|  * | ||||
|  * Returns true if the closure was successfully locked. | ||||
|  */ | ||||
| bool closure_trylock(struct closure *cl, struct closure *parent) | ||||
| { | ||||
| 	if (atomic_cmpxchg(&cl->remaining, -1, | ||||
| 			   CLOSURE_REMAINING_INITIALIZER) != -1) | ||||
| 		return false; | ||||
| 
 | ||||
| 	closure_set_ret_ip(cl); | ||||
| 
 | ||||
| 	smp_mb(); | ||||
| 	cl->parent = parent; | ||||
| 	if (parent) | ||||
| 		closure_get(parent); | ||||
| 
 | ||||
| 	closure_debug_create(cl); | ||||
| 	return true; | ||||
| } | ||||
| EXPORT_SYMBOL_GPL(closure_trylock); | ||||
| 
 | ||||
| void __closure_lock(struct closure *cl, struct closure *parent, | ||||
| 		    struct closure_waitlist *wait_list) | ||||
| { | ||||
| 	struct closure wait; | ||||
| 	closure_init_stack(&wait); | ||||
| 
 | ||||
| 	while (1) { | ||||
| 		if (closure_trylock(cl, parent)) | ||||
| 			return; | ||||
| 
 | ||||
| 		closure_wait_event_sync(wait_list, &wait, | ||||
| 					atomic_read(&cl->remaining) == -1); | ||||
| 	} | ||||
| } | ||||
| EXPORT_SYMBOL_GPL(__closure_lock); | ||||
| 
 | ||||
| static void closure_delay_timer_fn(unsigned long data) | ||||
| { | ||||
| 	struct closure *cl = (struct closure *) data; | ||||
| 	closure_sub(cl, CLOSURE_TIMER + 1); | ||||
| } | ||||
| 
 | ||||
| void do_closure_timer_init(struct closure *cl) | ||||
| { | ||||
| 	struct timer_list *timer = closure_timer(cl); | ||||
| 
 | ||||
| 	init_timer(timer); | ||||
| 	timer->data	= (unsigned long) cl; | ||||
| 	timer->function = closure_delay_timer_fn; | ||||
| } | ||||
| EXPORT_SYMBOL_GPL(do_closure_timer_init); | ||||
| 
 | ||||
| bool __closure_delay(struct closure *cl, unsigned long delay, | ||||
| 		     struct timer_list *timer) | ||||
| { | ||||
| 	if (atomic_read(&cl->remaining) & CLOSURE_TIMER) | ||||
| 		return false; | ||||
| 
 | ||||
| 	BUG_ON(timer_pending(timer)); | ||||
| 
 | ||||
| 	timer->expires	= jiffies + delay; | ||||
| 
 | ||||
| 	atomic_add(CLOSURE_TIMER + 1, &cl->remaining); | ||||
| 	add_timer(timer); | ||||
| 	return true; | ||||
| } | ||||
| EXPORT_SYMBOL_GPL(__closure_delay); | ||||
| 
 | ||||
| void __closure_flush(struct closure *cl, struct timer_list *timer) | ||||
| { | ||||
| 	if (del_timer(timer)) | ||||
| 		closure_sub(cl, CLOSURE_TIMER + 1); | ||||
| } | ||||
| EXPORT_SYMBOL_GPL(__closure_flush); | ||||
| 
 | ||||
| void __closure_flush_sync(struct closure *cl, struct timer_list *timer) | ||||
| { | ||||
| 	if (del_timer_sync(timer)) | ||||
| 		closure_sub(cl, CLOSURE_TIMER + 1); | ||||
| } | ||||
| EXPORT_SYMBOL_GPL(__closure_flush_sync); | ||||
| 
 | ||||
| #ifdef CONFIG_BCACHE_CLOSURES_DEBUG | ||||
| 
 | ||||
| static LIST_HEAD(closure_list); | ||||
| static DEFINE_SPINLOCK(closure_list_lock); | ||||
| 
 | ||||
| void closure_debug_create(struct closure *cl) | ||||
| { | ||||
| 	unsigned long flags; | ||||
| 
 | ||||
| 	BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE); | ||||
| 	cl->magic = CLOSURE_MAGIC_ALIVE; | ||||
| 
 | ||||
| 	spin_lock_irqsave(&closure_list_lock, flags); | ||||
| 	list_add(&cl->all, &closure_list); | ||||
| 	spin_unlock_irqrestore(&closure_list_lock, flags); | ||||
| } | ||||
| EXPORT_SYMBOL_GPL(closure_debug_create); | ||||
| 
 | ||||
| void closure_debug_destroy(struct closure *cl) | ||||
| { | ||||
| 	unsigned long flags; | ||||
| 
 | ||||
| 	BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE); | ||||
| 	cl->magic = CLOSURE_MAGIC_DEAD; | ||||
| 
 | ||||
| 	spin_lock_irqsave(&closure_list_lock, flags); | ||||
| 	list_del(&cl->all); | ||||
| 	spin_unlock_irqrestore(&closure_list_lock, flags); | ||||
| } | ||||
| EXPORT_SYMBOL_GPL(closure_debug_destroy); | ||||
| 
 | ||||
| static struct dentry *debug; | ||||
| 
 | ||||
| #define work_data_bits(work) ((unsigned long *)(&(work)->data)) | ||||
| 
 | ||||
| static int debug_seq_show(struct seq_file *f, void *data) | ||||
| { | ||||
| 	struct closure *cl; | ||||
| 	spin_lock_irq(&closure_list_lock); | ||||
| 
 | ||||
| 	list_for_each_entry(cl, &closure_list, all) { | ||||
| 		int r = atomic_read(&cl->remaining); | ||||
| 
 | ||||
| 		seq_printf(f, "%p: %pF -> %pf p %p r %i ", | ||||
| 			   cl, (void *) cl->ip, cl->fn, cl->parent, | ||||
| 			   r & CLOSURE_REMAINING_MASK); | ||||
| 
 | ||||
| 		seq_printf(f, "%s%s%s%s%s%s\n", | ||||
| 			   test_bit(WORK_STRUCT_PENDING, | ||||
| 				    work_data_bits(&cl->work)) ? "Q" : "", | ||||
| 			   r & CLOSURE_RUNNING	? "R" : "", | ||||
| 			   r & CLOSURE_BLOCKING	? "B" : "", | ||||
| 			   r & CLOSURE_STACK	? "S" : "", | ||||
| 			   r & CLOSURE_SLEEPING	? "Sl" : "", | ||||
| 			   r & CLOSURE_TIMER	? "T" : ""); | ||||
| 
 | ||||
| 		if (r & CLOSURE_WAITING) | ||||
| 			seq_printf(f, " W %pF\n", | ||||
| 				   (void *) cl->waiting_on); | ||||
| 
 | ||||
| 		seq_printf(f, "\n"); | ||||
| 	} | ||||
| 
 | ||||
| 	spin_unlock_irq(&closure_list_lock); | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| static int debug_seq_open(struct inode *inode, struct file *file) | ||||
| { | ||||
| 	return single_open(file, debug_seq_show, NULL); | ||||
| } | ||||
| 
 | ||||
| static const struct file_operations debug_ops = { | ||||
| 	.owner		= THIS_MODULE, | ||||
| 	.open		= debug_seq_open, | ||||
| 	.read		= seq_read, | ||||
| 	.release	= single_release | ||||
| }; | ||||
| 
 | ||||
| int __init closure_debug_init(void) | ||||
| { | ||||
| 	debug = debugfs_create_file("closures", 0400, NULL, NULL, &debug_ops); | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| module_init(closure_debug_init); | ||||
| 
 | ||||
| #endif | ||||
| 
 | ||||
| MODULE_AUTHOR("Kent Overstreet <koverstreet@google.com>"); | ||||
| MODULE_LICENSE("GPL"); | ||||
							
								
								
									
										670
									
								
								drivers/md/bcache/closure.h
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										670
									
								
								drivers/md/bcache/closure.h
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,670 @@ | |||
| #ifndef _LINUX_CLOSURE_H | ||||
| #define _LINUX_CLOSURE_H | ||||
| 
 | ||||
| #include <linux/llist.h> | ||||
| #include <linux/sched.h> | ||||
| #include <linux/workqueue.h> | ||||
| 
 | ||||
| /*
 | ||||
|  * Closure is perhaps the most overused and abused term in computer science, but | ||||
|  * since I've been unable to come up with anything better you're stuck with it | ||||
|  * again. | ||||
|  * | ||||
|  * What are closures? | ||||
|  * | ||||
|  * They embed a refcount. The basic idea is they count "things that are in | ||||
|  * progress" - in flight bios, some other thread that's doing something else - | ||||
|  * anything you might want to wait on. | ||||
|  * | ||||
|  * The refcount may be manipulated with closure_get() and closure_put(). | ||||
|  * closure_put() is where many of the interesting things happen, when it causes | ||||
|  * the refcount to go to 0. | ||||
|  * | ||||
|  * Closures can be used to wait on things both synchronously and asynchronously, | ||||
|  * and synchronous and asynchronous use can be mixed without restriction. To | ||||
|  * wait synchronously, use closure_sync() - you will sleep until your closure's | ||||
|  * refcount hits 1. | ||||
|  * | ||||
|  * To wait asynchronously, use | ||||
|  *   continue_at(cl, next_function, workqueue); | ||||
|  * | ||||
|  * passing it, as you might expect, the function to run when nothing is pending | ||||
|  * and the workqueue to run that function out of. | ||||
|  * | ||||
|  * continue_at() also, critically, is a macro that returns the calling function. | ||||
|  * There's good reason for this. | ||||
|  * | ||||
|  * To safely use closures asynchronously, they must always have a refcount while | ||||
|  * they are running owned by the thread that is running them. Otherwise, suppose | ||||
|  * you submit some bios and wish to have a function run when they all complete: | ||||
|  * | ||||
|  * foo_endio(struct bio *bio, int error) | ||||
|  * { | ||||
|  *	closure_put(cl); | ||||
|  * } | ||||
|  * | ||||
|  * closure_init(cl); | ||||
|  * | ||||
|  * do_stuff(); | ||||
|  * closure_get(cl); | ||||
|  * bio1->bi_endio = foo_endio; | ||||
|  * bio_submit(bio1); | ||||
|  * | ||||
|  * do_more_stuff(); | ||||
|  * closure_get(cl); | ||||
|  * bio2->bi_endio = foo_endio; | ||||
|  * bio_submit(bio2); | ||||
|  * | ||||
|  * continue_at(cl, complete_some_read, system_wq); | ||||
|  * | ||||
|  * If closure's refcount started at 0, complete_some_read() could run before the | ||||
|  * second bio was submitted - which is almost always not what you want! More | ||||
|  * importantly, it wouldn't be possible to say whether the original thread or | ||||
|  * complete_some_read()'s thread owned the closure - and whatever state it was | ||||
|  * associated with! | ||||
|  * | ||||
|  * So, closure_init() initializes a closure's refcount to 1 - and when a | ||||
|  * closure_fn is run, the refcount will be reset to 1 first. | ||||
|  * | ||||
|  * Then, the rule is - if you got the refcount with closure_get(), release it | ||||
|  * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount | ||||
|  * on a closure because you called closure_init() or you were run out of a | ||||
|  * closure - _always_ use continue_at(). Doing so consistently will help | ||||
|  * eliminate an entire class of particularly pernicious races. | ||||
|  * | ||||
|  * For a closure to wait on an arbitrary event, we need to introduce waitlists: | ||||
|  * | ||||
|  * struct closure_waitlist list; | ||||
|  * closure_wait_event(list, cl, condition); | ||||
|  * closure_wake_up(wait_list); | ||||
|  * | ||||
|  * These work analogously to wait_event() and wake_up() - except that instead of | ||||
|  * operating on the current thread (for wait_event()) and lists of threads, they | ||||
|  * operate on an explicit closure and lists of closures. | ||||
|  * | ||||
|  * Because it's a closure we can now wait either synchronously or | ||||
|  * asynchronously. closure_wait_event() returns the current value of the | ||||
|  * condition, and if it returned false continue_at() or closure_sync() can be | ||||
|  * used to wait for it to become true. | ||||
|  * | ||||
|  * It's useful for waiting on things when you can't sleep in the context in | ||||
|  * which you must check the condition (perhaps a spinlock held, or you might be | ||||
|  * beneath generic_make_request() - in which case you can't sleep on IO). | ||||
|  * | ||||
|  * closure_wait_event() will wait either synchronously or asynchronously, | ||||
|  * depending on whether the closure is in blocking mode or not. You can pick a | ||||
|  * mode explicitly with closure_wait_event_sync() and | ||||
|  * closure_wait_event_async(), which do just what you might expect. | ||||
|  * | ||||
|  * Lastly, you might have a wait list dedicated to a specific event, and have no | ||||
|  * need for specifying the condition - you just want to wait until someone runs | ||||
|  * closure_wake_up() on the appropriate wait list. In that case, just use | ||||
|  * closure_wait(). It will return either true or false, depending on whether the | ||||
|  * closure was already on a wait list or not - a closure can only be on one wait | ||||
|  * list at a time. | ||||
|  * | ||||
|  * Parents: | ||||
|  * | ||||
|  * closure_init() takes two arguments - it takes the closure to initialize, and | ||||
|  * a (possibly null) parent. | ||||
|  * | ||||
|  * If parent is non null, the new closure will have a refcount for its lifetime; | ||||
|  * a closure is considered to be "finished" when its refcount hits 0 and the | ||||
|  * function to run is null. Hence | ||||
|  * | ||||
|  * continue_at(cl, NULL, NULL); | ||||
|  * | ||||
|  * returns up the (spaghetti) stack of closures, precisely like normal return | ||||
|  * returns up the C stack. continue_at() with non null fn is better thought of | ||||
|  * as doing a tail call. | ||||
|  * | ||||
|  * All this implies that a closure should typically be embedded in a particular | ||||
|  * struct (which its refcount will normally control the lifetime of), and that | ||||
|  * struct can very much be thought of as a stack frame. | ||||
|  * | ||||
|  * Locking: | ||||
|  * | ||||
|  * Closures are based on work items but they can be thought of as more like | ||||
|  * threads - in that like threads and unlike work items they have a well | ||||
|  * defined lifetime; they are created (with closure_init()) and eventually | ||||
|  * complete after a continue_at(cl, NULL, NULL). | ||||
|  * | ||||
|  * Suppose you've got some larger structure with a closure embedded in it that's | ||||
|  * used for periodically doing garbage collection. You only want one garbage | ||||
|  * collection happening at a time, so the natural thing to do is protect it with | ||||
|  * a lock. However, it's difficult to use a lock protecting a closure correctly | ||||
|  * because the unlock should come after the last continue_at() (additionally, if | ||||
|  * you're using the closure asynchronously a mutex won't work since a mutex has | ||||
|  * to be unlocked by the same process that locked it). | ||||
|  * | ||||
|  * So to make it less error prone and more efficient, we also have the ability | ||||
|  * to use closures as locks: | ||||
|  * | ||||
|  * closure_init_unlocked(); | ||||
|  * closure_trylock(); | ||||
|  * | ||||
|  * That's all we need for trylock() - the last closure_put() implicitly unlocks | ||||
|  * it for you.  But for closure_lock(), we also need a wait list: | ||||
|  * | ||||
|  * struct closure_with_waitlist frobnicator_cl; | ||||
|  * | ||||
|  * closure_init_unlocked(&frobnicator_cl); | ||||
|  * closure_lock(&frobnicator_cl); | ||||
|  * | ||||
|  * A closure_with_waitlist embeds a closure and a wait list - much like struct | ||||
|  * delayed_work embeds a work item and a timer_list. The important thing is, use | ||||
|  * it exactly like you would a regular closure and closure_put() will magically | ||||
|  * handle everything for you. | ||||
|  * | ||||
|  * We've got closures that embed timers, too. They're called, appropriately | ||||
|  * enough: | ||||
|  * struct closure_with_timer; | ||||
|  * | ||||
|  * This gives you access to closure_delay(). It takes a refcount for a specified | ||||
|  * number of jiffies - you could then call closure_sync() (for a slightly | ||||
|  * convoluted version of msleep()) or continue_at() - which gives you the same | ||||
|  * effect as using a delayed work item, except you can reuse the work_struct | ||||
|  * already embedded in struct closure. | ||||
|  * | ||||
|  * Lastly, there's struct closure_with_waitlist_and_timer. It does what you | ||||
|  * probably expect, if you happen to need the features of both. (You don't | ||||
|  * really want to know how all this is implemented, but if I've done my job | ||||
|  * right you shouldn't have to care). | ||||
|  */ | ||||
| 
 | ||||
| struct closure; | ||||
| typedef void (closure_fn) (struct closure *); | ||||
| 
 | ||||
| struct closure_waitlist { | ||||
| 	struct llist_head	list; | ||||
| }; | ||||
| 
 | ||||
| enum closure_type { | ||||
| 	TYPE_closure				= 0, | ||||
| 	TYPE_closure_with_waitlist		= 1, | ||||
| 	TYPE_closure_with_timer			= 2, | ||||
| 	TYPE_closure_with_waitlist_and_timer	= 3, | ||||
| 	MAX_CLOSURE_TYPE			= 3, | ||||
| }; | ||||
| 
 | ||||
| enum closure_state { | ||||
| 	/*
 | ||||
| 	 * CLOSURE_BLOCKING: Causes closure_wait_event() to block, instead of | ||||
| 	 * waiting asynchronously | ||||
| 	 * | ||||
| 	 * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by | ||||
| 	 * the thread that owns the closure, and cleared by the thread that's | ||||
| 	 * waking up the closure. | ||||
| 	 * | ||||
| 	 * CLOSURE_SLEEPING: Must be set before a thread uses a closure to sleep | ||||
| 	 * - indicates that cl->task is valid and closure_put() may wake it up. | ||||
| 	 * Only set or cleared by the thread that owns the closure. | ||||
| 	 * | ||||
| 	 * CLOSURE_TIMER: Analogous to CLOSURE_WAITING, indicates that a closure | ||||
| 	 * has an outstanding timer. Must be set by the thread that owns the | ||||
| 	 * closure, and cleared by the timer function when the timer goes off. | ||||
| 	 * | ||||
| 	 * The rest are for debugging and don't affect behaviour: | ||||
| 	 * | ||||
| 	 * CLOSURE_RUNNING: Set when a closure is running (i.e. by | ||||
| 	 * closure_init() and when closure_put() runs the next function), and | ||||
| 	 * must be cleared before remaining hits 0. Primarily to help guard | ||||
| 	 * against incorrect usage and accidentally transferring references. | ||||
| 	 * continue_at() and closure_return() clear it for you, if you're doing | ||||
| 	 * something unusual you can use closure_set_stopped() which also helps | ||||
| 	 * annotate where references are being transferred. | ||||
| 	 * | ||||
| 	 * CLOSURE_STACK: Sanity check - remaining should never hit 0 on a | ||||
| 	 * closure with this flag set | ||||
| 	 */ | ||||
| 
 | ||||
| 	CLOSURE_BITS_START	= (1 << 19), | ||||
| 	CLOSURE_DESTRUCTOR	= (1 << 19), | ||||
| 	CLOSURE_BLOCKING	= (1 << 21), | ||||
| 	CLOSURE_WAITING		= (1 << 23), | ||||
| 	CLOSURE_SLEEPING	= (1 << 25), | ||||
| 	CLOSURE_TIMER		= (1 << 27), | ||||
| 	CLOSURE_RUNNING		= (1 << 29), | ||||
| 	CLOSURE_STACK		= (1 << 31), | ||||
| }; | ||||
| 
 | ||||
| #define CLOSURE_GUARD_MASK					\ | ||||
| 	((CLOSURE_DESTRUCTOR|CLOSURE_BLOCKING|CLOSURE_WAITING|	\ | ||||
| 	  CLOSURE_SLEEPING|CLOSURE_TIMER|CLOSURE_RUNNING|CLOSURE_STACK) << 1) | ||||
| 
 | ||||
| #define CLOSURE_REMAINING_MASK		(CLOSURE_BITS_START - 1) | ||||
| #define CLOSURE_REMAINING_INITIALIZER	(1|CLOSURE_RUNNING) | ||||
| 
 | ||||
| struct closure { | ||||
| 	union { | ||||
| 		struct { | ||||
| 			struct workqueue_struct *wq; | ||||
| 			struct task_struct	*task; | ||||
| 			struct llist_node	list; | ||||
| 			closure_fn		*fn; | ||||
| 		}; | ||||
| 		struct work_struct	work; | ||||
| 	}; | ||||
| 
 | ||||
| 	struct closure		*parent; | ||||
| 
 | ||||
| 	atomic_t		remaining; | ||||
| 
 | ||||
| 	enum closure_type	type; | ||||
| 
 | ||||
| #ifdef CONFIG_BCACHE_CLOSURES_DEBUG | ||||
| #define CLOSURE_MAGIC_DEAD	0xc054dead | ||||
| #define CLOSURE_MAGIC_ALIVE	0xc054a11e | ||||
| 
 | ||||
| 	unsigned		magic; | ||||
| 	struct list_head	all; | ||||
| 	unsigned long		ip; | ||||
| 	unsigned long		waiting_on; | ||||
| #endif | ||||
| }; | ||||
| 
 | ||||
| struct closure_with_waitlist { | ||||
| 	struct closure		cl; | ||||
| 	struct closure_waitlist	wait; | ||||
| }; | ||||
| 
 | ||||
| struct closure_with_timer { | ||||
| 	struct closure		cl; | ||||
| 	struct timer_list	timer; | ||||
| }; | ||||
| 
 | ||||
| struct closure_with_waitlist_and_timer { | ||||
| 	struct closure		cl; | ||||
| 	struct closure_waitlist	wait; | ||||
| 	struct timer_list	timer; | ||||
| }; | ||||
| 
 | ||||
| extern unsigned invalid_closure_type(void); | ||||
| 
 | ||||
| #define __CLOSURE_TYPE(cl, _t)						\ | ||||
| 	  __builtin_types_compatible_p(typeof(cl), struct _t)		\ | ||||
| 		? TYPE_ ## _t :						\ | ||||
| 
 | ||||
| #define __closure_type(cl)						\ | ||||
| (									\ | ||||
| 	__CLOSURE_TYPE(cl, closure)					\ | ||||
| 	__CLOSURE_TYPE(cl, closure_with_waitlist)			\ | ||||
| 	__CLOSURE_TYPE(cl, closure_with_timer)				\ | ||||
| 	__CLOSURE_TYPE(cl, closure_with_waitlist_and_timer)		\ | ||||
| 	invalid_closure_type()						\ | ||||
| ) | ||||
| 
 | ||||
| void closure_sub(struct closure *cl, int v); | ||||
| void closure_put(struct closure *cl); | ||||
| void closure_queue(struct closure *cl); | ||||
| void __closure_wake_up(struct closure_waitlist *list); | ||||
| bool closure_wait(struct closure_waitlist *list, struct closure *cl); | ||||
| void closure_sync(struct closure *cl); | ||||
| 
 | ||||
| bool closure_trylock(struct closure *cl, struct closure *parent); | ||||
| void __closure_lock(struct closure *cl, struct closure *parent, | ||||
| 		    struct closure_waitlist *wait_list); | ||||
| 
 | ||||
| void do_closure_timer_init(struct closure *cl); | ||||
| bool __closure_delay(struct closure *cl, unsigned long delay, | ||||
| 		     struct timer_list *timer); | ||||
| void __closure_flush(struct closure *cl, struct timer_list *timer); | ||||
| void __closure_flush_sync(struct closure *cl, struct timer_list *timer); | ||||
| 
 | ||||
| #ifdef CONFIG_BCACHE_CLOSURES_DEBUG | ||||
| 
 | ||||
| void closure_debug_create(struct closure *cl); | ||||
| void closure_debug_destroy(struct closure *cl); | ||||
| 
 | ||||
| #else | ||||
| 
 | ||||
| static inline void closure_debug_create(struct closure *cl) {} | ||||
| static inline void closure_debug_destroy(struct closure *cl) {} | ||||
| 
 | ||||
| #endif | ||||
| 
 | ||||
| static inline void closure_set_ip(struct closure *cl) | ||||
| { | ||||
| #ifdef CONFIG_BCACHE_CLOSURES_DEBUG | ||||
| 	cl->ip = _THIS_IP_; | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| static inline void closure_set_ret_ip(struct closure *cl) | ||||
| { | ||||
| #ifdef CONFIG_BCACHE_CLOSURES_DEBUG | ||||
| 	cl->ip = _RET_IP_; | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| static inline void closure_get(struct closure *cl) | ||||
| { | ||||
| #ifdef CONFIG_BCACHE_CLOSURES_DEBUG | ||||
| 	BUG_ON((atomic_inc_return(&cl->remaining) & | ||||
| 		CLOSURE_REMAINING_MASK) <= 1); | ||||
| #else | ||||
| 	atomic_inc(&cl->remaining); | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| static inline void closure_set_stopped(struct closure *cl) | ||||
| { | ||||
| 	atomic_sub(CLOSURE_RUNNING, &cl->remaining); | ||||
| } | ||||
| 
 | ||||
| static inline bool closure_is_stopped(struct closure *cl) | ||||
| { | ||||
| 	return !(atomic_read(&cl->remaining) & CLOSURE_RUNNING); | ||||
| } | ||||
| 
 | ||||
| static inline bool closure_is_unlocked(struct closure *cl) | ||||
| { | ||||
| 	return atomic_read(&cl->remaining) == -1; | ||||
| } | ||||
| 
 | ||||
| static inline void do_closure_init(struct closure *cl, struct closure *parent, | ||||
| 				   bool running) | ||||
| { | ||||
| 	switch (cl->type) { | ||||
| 	case TYPE_closure_with_timer: | ||||
| 	case TYPE_closure_with_waitlist_and_timer: | ||||
| 		do_closure_timer_init(cl); | ||||
| 	default: | ||||
| 		break; | ||||
| 	} | ||||
| 
 | ||||
| 	cl->parent = parent; | ||||
| 	if (parent) | ||||
| 		closure_get(parent); | ||||
| 
 | ||||
| 	if (running) { | ||||
| 		closure_debug_create(cl); | ||||
| 		atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); | ||||
| 	} else | ||||
| 		atomic_set(&cl->remaining, -1); | ||||
| 
 | ||||
| 	closure_set_ip(cl); | ||||
| } | ||||
| 
 | ||||
/*
 * Hack to get at the embedded closure if there is one, by doing an unsafe cast:
 * the result of __closure_type() is thrown away, it's used merely for type
 * checking.
 *
 * @cl is parenthesized at every use (as closure_init_type() already does) so
 * that arguments which are expressions rather than plain identifiers expand
 * with the intended precedence.
 */
#define __to_internal_closure(cl)				\
({								\
	BUILD_BUG_ON(__closure_type(*(cl)) > MAX_CLOSURE_TYPE);	\
	(struct closure *) (cl);				\
})
| 
 | ||||
| #define closure_init_type(cl, parent, running)			\ | ||||
| do {								\ | ||||
| 	struct closure *_cl = __to_internal_closure(cl);	\ | ||||
| 	_cl->type = __closure_type(*(cl));			\ | ||||
| 	do_closure_init(_cl, parent, running);			\ | ||||
| } while (0) | ||||
| 
 | ||||
| /**
 | ||||
|  * __closure_init() - Initialize a closure, skipping the memset() | ||||
|  * | ||||
|  * May be used instead of closure_init() when memory has already been zeroed. | ||||
|  */ | ||||
| #define __closure_init(cl, parent)				\ | ||||
| 	closure_init_type(cl, parent, true) | ||||
| 
 | ||||
| /**
 | ||||
|  * closure_init() - Initialize a closure, setting the refcount to 1 | ||||
|  * @cl:		closure to initialize | ||||
|  * @parent:	parent of the new closure. cl will take a refcount on it for its | ||||
|  *		lifetime; may be NULL. | ||||
|  */ | ||||
| #define closure_init(cl, parent)				\ | ||||
| do {								\ | ||||
| 	memset((cl), 0, sizeof(*(cl)));				\ | ||||
| 	__closure_init(cl, parent);				\ | ||||
| } while (0) | ||||
| 
 | ||||
| static inline void closure_init_stack(struct closure *cl) | ||||
| { | ||||
| 	memset(cl, 0, sizeof(struct closure)); | ||||
| 	atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER| | ||||
| 		   CLOSURE_BLOCKING|CLOSURE_STACK); | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * closure_init_unlocked() - Initialize a closure but leave it unlocked. | ||||
|  * @cl:		closure to initialize | ||||
|  * | ||||
|  * For when the closure will be used as a lock. The closure may not be used | ||||
|  * until after a closure_lock() or closure_trylock(). | ||||
|  */ | ||||
| #define closure_init_unlocked(cl)				\ | ||||
| do {								\ | ||||
| 	memset((cl), 0, sizeof(*(cl)));				\ | ||||
| 	closure_init_type(cl, NULL, false);			\ | ||||
| } while (0) | ||||
| 
 | ||||
| /**
 | ||||
|  * closure_lock() - lock and initialize a closure. | ||||
|  * @cl:		the closure to lock | ||||
|  * @parent:	the new parent for this closure | ||||
|  * | ||||
|  * The closure must be of one of the types that has a waitlist (otherwise we | ||||
|  * wouldn't be able to sleep on contention). | ||||
|  * | ||||
|  * @parent has exactly the same meaning as in closure_init(); if non null, the | ||||
|  * closure will take a reference on @parent which will be released when it is | ||||
|  * unlocked. | ||||
|  */ | ||||
| #define closure_lock(cl, parent)				\ | ||||
| 	__closure_lock(__to_internal_closure(cl), parent, &(cl)->wait) | ||||
| 
 | ||||
| /**
 | ||||
|  * closure_delay() - delay some number of jiffies | ||||
|  * @cl:		the closure that will sleep | ||||
|  * @delay:	the delay in jiffies | ||||
|  * | ||||
|  * Takes a refcount on @cl which will be released after @delay jiffies; this may | ||||
|  * be used to have a function run after a delay with continue_at(), or | ||||
|  * closure_sync() may be used for a convoluted version of msleep(). | ||||
|  */ | ||||
| #define closure_delay(cl, delay)			\ | ||||
| 	__closure_delay(__to_internal_closure(cl), delay, &(cl)->timer) | ||||
| 
 | ||||
| #define closure_flush(cl)				\ | ||||
| 	__closure_flush(__to_internal_closure(cl), &(cl)->timer) | ||||
| 
 | ||||
| #define closure_flush_sync(cl)				\ | ||||
| 	__closure_flush_sync(__to_internal_closure(cl), &(cl)->timer) | ||||
| 
 | ||||
| static inline void __closure_end_sleep(struct closure *cl) | ||||
| { | ||||
| 	__set_current_state(TASK_RUNNING); | ||||
| 
 | ||||
| 	if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING) | ||||
| 		atomic_sub(CLOSURE_SLEEPING, &cl->remaining); | ||||
| } | ||||
| 
 | ||||
| static inline void __closure_start_sleep(struct closure *cl) | ||||
| { | ||||
| 	closure_set_ip(cl); | ||||
| 	cl->task = current; | ||||
| 	set_current_state(TASK_UNINTERRUPTIBLE); | ||||
| 
 | ||||
| 	if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING)) | ||||
| 		atomic_add(CLOSURE_SLEEPING, &cl->remaining); | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * closure_blocking() - returns true if the closure is in blocking mode. | ||||
|  * | ||||
|  * If a closure is in blocking mode, closure_wait_event() will sleep until the | ||||
|  * condition is true instead of waiting asynchronously. | ||||
|  */ | ||||
| static inline bool closure_blocking(struct closure *cl) | ||||
| { | ||||
| 	return atomic_read(&cl->remaining) & CLOSURE_BLOCKING; | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * set_closure_blocking() - put a closure in blocking mode. | ||||
|  * | ||||
|  * If a closure is in blocking mode, closure_wait_event() will sleep until the | ||||
|  * condition is true instead of waiting asynchronously. | ||||
|  * | ||||
|  * Not thread safe - can only be called by the thread running the closure. | ||||
|  */ | ||||
| static inline void set_closure_blocking(struct closure *cl) | ||||
| { | ||||
| 	if (!closure_blocking(cl)) | ||||
| 		atomic_add(CLOSURE_BLOCKING, &cl->remaining); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Not thread safe - can only be called by the thread running the closure. | ||||
|  */ | ||||
| static inline void clear_closure_blocking(struct closure *cl) | ||||
| { | ||||
| 	if (closure_blocking(cl)) | ||||
| 		atomic_sub(CLOSURE_BLOCKING, &cl->remaining); | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * closure_wake_up() - wake up all closures on a wait list. | ||||
|  */ | ||||
| static inline void closure_wake_up(struct closure_waitlist *list) | ||||
| { | ||||
| 	smp_mb(); | ||||
| 	__closure_wake_up(list); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Wait on an event, synchronously or asynchronously - analogous to wait_event() | ||||
|  * but for closures. | ||||
|  * | ||||
|  * The loop is oddly structured so as to avoid a race; we must check the | ||||
|  * condition again after we've added ourself to the waitlist. We know if we were | ||||
|  * already on the waitlist because closure_wait() returns false; thus, we only | ||||
|  * schedule or break if closure_wait() returns false. If it returns true, we | ||||
|  * just loop again - rechecking the condition. | ||||
|  * | ||||
|  * The __closure_wake_up() is necessary because we may race with the event | ||||
|  * becoming true; i.e. we see event false -> wait -> recheck condition, but the | ||||
|  * thread that made the event true may have called closure_wake_up() before we | ||||
|  * added ourself to the wait list. | ||||
|  * | ||||
|  * We have to call closure_sync() at the end instead of just | ||||
|  * __closure_end_sleep() because a different thread might've called | ||||
|  * closure_wake_up() before us and gotten preempted before they dropped the | ||||
|  * refcount on our closure. If this was a stack allocated closure, that would be | ||||
|  * bad. | ||||
|  */ | ||||
| #define __closure_wait_event(list, cl, condition, _block)		\ | ||||
| ({									\ | ||||
| 	bool block = _block;						\ | ||||
| 	typeof(condition) ret;						\ | ||||
| 									\ | ||||
| 	while (1) {							\ | ||||
| 		ret = (condition);					\ | ||||
| 		if (ret) {						\ | ||||
| 			__closure_wake_up(list);			\ | ||||
| 			if (block)					\ | ||||
| 				closure_sync(cl);			\ | ||||
| 									\ | ||||
| 			break;						\ | ||||
| 		}							\ | ||||
| 									\ | ||||
| 		if (block)						\ | ||||
| 			__closure_start_sleep(cl);			\ | ||||
| 									\ | ||||
| 		if (!closure_wait(list, cl)) {				\ | ||||
| 			if (!block)					\ | ||||
| 				break;					\ | ||||
| 									\ | ||||
| 			schedule();					\ | ||||
| 		}							\ | ||||
| 	}								\ | ||||
| 									\ | ||||
| 	ret;								\ | ||||
| }) | ||||
| 
 | ||||
| /**
 | ||||
|  * closure_wait_event() - wait on a condition, synchronously or asynchronously. | ||||
|  * @list:	the wait list to wait on | ||||
|  * @cl:		the closure that is doing the waiting | ||||
|  * @condition:	a C expression for the event to wait for | ||||
|  * | ||||
|  * If the closure is in blocking mode, sleeps until the @condition evaluates to | ||||
|  * true - exactly like wait_event(). | ||||
|  * | ||||
|  * If the closure is not in blocking mode, waits asynchronously; if the | ||||
|  * condition is currently false the @cl is put onto @list and returns. @list | ||||
|  * owns a refcount on @cl; closure_sync() or continue_at() may be used later to | ||||
|  * wait for another thread to wake up @list, which drops the refcount on @cl. | ||||
|  * | ||||
|  * Returns the value of @condition; @cl will be on @list iff @condition was | ||||
|  * false. | ||||
|  * | ||||
|  * closure_wake_up(@list) must be called after changing any variable that could | ||||
|  * cause @condition to become true. | ||||
|  */ | ||||
| #define closure_wait_event(list, cl, condition)				\ | ||||
| 	__closure_wait_event(list, cl, condition, closure_blocking(cl)) | ||||
| 
 | ||||
| #define closure_wait_event_async(list, cl, condition)			\ | ||||
| 	__closure_wait_event(list, cl, condition, false) | ||||
| 
 | ||||
| #define closure_wait_event_sync(list, cl, condition)			\ | ||||
| 	__closure_wait_event(list, cl, condition, true) | ||||
| 
 | ||||
| static inline void set_closure_fn(struct closure *cl, closure_fn *fn, | ||||
| 				  struct workqueue_struct *wq) | ||||
| { | ||||
| 	BUG_ON(object_is_on_stack(cl)); | ||||
| 	closure_set_ip(cl); | ||||
| 	cl->fn = fn; | ||||
| 	cl->wq = wq; | ||||
| 	/* between atomic_dec() in closure_put() */ | ||||
| 	smp_mb__before_atomic_dec(); | ||||
| } | ||||
| 
 | ||||
| #define continue_at(_cl, _fn, _wq)					\ | ||||
| do {									\ | ||||
| 	set_closure_fn(_cl, _fn, _wq);					\ | ||||
| 	closure_sub(_cl, CLOSURE_RUNNING + 1);				\ | ||||
| 	return;								\ | ||||
| } while (0) | ||||
| 
 | ||||
| #define closure_return(_cl)	continue_at((_cl), NULL, NULL) | ||||
| 
 | ||||
/*
 * continue_at_nobarrier() - set the closure's next function and workqueue,
 * hand it straight to closure_queue(), and return from the calling function.
 *
 * Unlike continue_at() this does not go through closure_sub(); the closure is
 * queued directly.
 *
 * The macro must refer to its own parameter (_cl), not to whatever variable
 * named "cl" happens to be in scope at the call site.
 */
#define continue_at_nobarrier(_cl, _fn, _wq)				\
do {									\
	set_closure_fn(_cl, _fn, _wq);					\
	closure_queue(_cl);						\
	return;								\
} while (0)
| 
 | ||||
| #define closure_return_with_destructor(_cl, _destructor)		\ | ||||
| do {									\ | ||||
| 	set_closure_fn(_cl, _destructor, NULL);				\ | ||||
| 	closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1);	\ | ||||
| 	return;								\ | ||||
| } while (0) | ||||
| 
 | ||||
| static inline void closure_call(struct closure *cl, closure_fn fn, | ||||
| 				struct workqueue_struct *wq, | ||||
| 				struct closure *parent) | ||||
| { | ||||
| 	closure_init(cl, parent); | ||||
| 	continue_at_nobarrier(cl, fn, wq); | ||||
| } | ||||
| 
 | ||||
| static inline void closure_trylock_call(struct closure *cl, closure_fn fn, | ||||
| 					struct workqueue_struct *wq, | ||||
| 					struct closure *parent) | ||||
| { | ||||
| 	if (closure_trylock(cl, parent)) | ||||
| 		continue_at_nobarrier(cl, fn, wq); | ||||
| } | ||||
| 
 | ||||
| #endif /* _LINUX_CLOSURE_H */ | ||||
							
								
								
									
										563
									
								
								drivers/md/bcache/debug.c
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										563
									
								
								drivers/md/bcache/debug.c
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,563 @@ | |||
| /*
 | ||||
|  * Assorted bcache debug code | ||||
|  * | ||||
|  * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> | ||||
|  * Copyright 2012 Google, Inc. | ||||
|  */ | ||||
| 
 | ||||
| #include "bcache.h" | ||||
| #include "btree.h" | ||||
| #include "debug.h" | ||||
| #include "request.h" | ||||
| 
 | ||||
| #include <linux/console.h> | ||||
| #include <linux/debugfs.h> | ||||
| #include <linux/module.h> | ||||
| #include <linux/random.h> | ||||
| #include <linux/seq_file.h> | ||||
| 
 | ||||
| static struct dentry *debug; | ||||
| 
 | ||||
| const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) | ||||
| { | ||||
| 	unsigned i; | ||||
| 
 | ||||
| 	for (i = 0; i < KEY_PTRS(k); i++) | ||||
| 		if (ptr_available(c, k, i)) { | ||||
| 			struct cache *ca = PTR_CACHE(c, k, i); | ||||
| 			size_t bucket = PTR_BUCKET_NR(c, k, i); | ||||
| 			size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); | ||||
| 
 | ||||
| 			if (KEY_SIZE(k) + r > c->sb.bucket_size) | ||||
| 				return "bad, length too big"; | ||||
| 			if (bucket <  ca->sb.first_bucket) | ||||
| 				return "bad, short offset"; | ||||
| 			if (bucket >= ca->sb.nbuckets) | ||||
| 				return "bad, offset past end of device"; | ||||
| 			if (ptr_stale(c, k, i)) | ||||
| 				return "stale"; | ||||
| 		} | ||||
| 
 | ||||
| 	if (!bkey_cmp(k, &ZERO_KEY)) | ||||
| 		return "bad, null key"; | ||||
| 	if (!KEY_PTRS(k)) | ||||
| 		return "bad, no pointers"; | ||||
| 	if (!KEY_SIZE(k)) | ||||
| 		return "zeroed key"; | ||||
| 	return ""; | ||||
| } | ||||
| 
 | ||||
| struct keyprint_hack bch_pkey(const struct bkey *k) | ||||
| { | ||||
| 	unsigned i = 0; | ||||
| 	struct keyprint_hack r; | ||||
| 	char *out = r.s, *end = r.s + KEYHACK_SIZE; | ||||
| 
 | ||||
| #define p(...)	(out += scnprintf(out, end - out, __VA_ARGS__)) | ||||
| 
 | ||||
| 	p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_OFFSET(k), KEY_SIZE(k)); | ||||
| 
 | ||||
| 	if (KEY_PTRS(k)) | ||||
| 		while (1) { | ||||
| 			p("%llu:%llu gen %llu", | ||||
| 			  PTR_DEV(k, i), PTR_OFFSET(k, i), PTR_GEN(k, i)); | ||||
| 
 | ||||
| 			if (++i == KEY_PTRS(k)) | ||||
| 				break; | ||||
| 
 | ||||
| 			p(", "); | ||||
| 		} | ||||
| 
 | ||||
| 	p("]"); | ||||
| 
 | ||||
| 	if (KEY_DIRTY(k)) | ||||
| 		p(" dirty"); | ||||
| 	if (KEY_CSUM(k)) | ||||
| 		p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]); | ||||
| #undef p | ||||
| 	return r; | ||||
| } | ||||
| 
 | ||||
| struct keyprint_hack bch_pbtree(const struct btree *b) | ||||
| { | ||||
| 	struct keyprint_hack r; | ||||
| 
 | ||||
| 	snprintf(r.s, 40, "%li level %i/%i", PTR_BUCKET_NR(b->c, &b->key, 0), | ||||
| 		 b->level, b->c->root ? b->c->root->level : -1); | ||||
| 	return r; | ||||
| } | ||||
| 
 | ||||
| #if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG) | ||||
| 
 | ||||
| static bool skipped_backwards(struct btree *b, struct bkey *k) | ||||
| { | ||||
| 	return bkey_cmp(k, (!b->level) | ||||
| 			? &START_KEY(bkey_next(k)) | ||||
| 			: bkey_next(k)) > 0; | ||||
| } | ||||
| 
 | ||||
| static void dump_bset(struct btree *b, struct bset *i) | ||||
| { | ||||
| 	struct bkey *k; | ||||
| 	unsigned j; | ||||
| 
 | ||||
| 	for (k = i->start; k < end(i); k = bkey_next(k)) { | ||||
| 		printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b), | ||||
| 		       (uint64_t *) k - i->d, i->keys, pkey(k)); | ||||
| 
 | ||||
| 		for (j = 0; j < KEY_PTRS(k); j++) { | ||||
| 			size_t n = PTR_BUCKET_NR(b->c, k, j); | ||||
| 			printk(" bucket %zu", n); | ||||
| 
 | ||||
| 			if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets) | ||||
| 				printk(" prio %i", | ||||
| 				       PTR_BUCKET(b->c, k, j)->prio); | ||||
| 		} | ||||
| 
 | ||||
| 		printk(" %s\n", bch_ptr_status(b->c, k)); | ||||
| 
 | ||||
| 		if (bkey_next(k) < end(i) && | ||||
| 		    skipped_backwards(b, k)) | ||||
| 			printk(KERN_ERR "Key skipped backwards\n"); | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| #endif | ||||
| 
 | ||||
| #ifdef CONFIG_BCACHE_DEBUG | ||||
| 
 | ||||
| void bch_btree_verify(struct btree *b, struct bset *new) | ||||
| { | ||||
| 	struct btree *v = b->c->verify_data; | ||||
| 	struct closure cl; | ||||
| 	closure_init_stack(&cl); | ||||
| 
 | ||||
| 	if (!b->c->verify) | ||||
| 		return; | ||||
| 
 | ||||
| 	closure_wait_event(&b->io.wait, &cl, | ||||
| 			   atomic_read(&b->io.cl.remaining) == -1); | ||||
| 
 | ||||
| 	mutex_lock(&b->c->verify_lock); | ||||
| 
 | ||||
| 	bkey_copy(&v->key, &b->key); | ||||
| 	v->written = 0; | ||||
| 	v->level = b->level; | ||||
| 
 | ||||
| 	bch_btree_read(v); | ||||
| 	closure_wait_event(&v->io.wait, &cl, | ||||
| 			   atomic_read(&b->io.cl.remaining) == -1); | ||||
| 
 | ||||
| 	if (new->keys != v->sets[0].data->keys || | ||||
| 	    memcmp(new->start, | ||||
| 		   v->sets[0].data->start, | ||||
| 		   (void *) end(new) - (void *) new->start)) { | ||||
| 		unsigned i, j; | ||||
| 
 | ||||
| 		console_lock(); | ||||
| 
 | ||||
| 		printk(KERN_ERR "*** original memory node:\n"); | ||||
| 		for (i = 0; i <= b->nsets; i++) | ||||
| 			dump_bset(b, b->sets[i].data); | ||||
| 
 | ||||
| 		printk(KERN_ERR "*** sorted memory node:\n"); | ||||
| 		dump_bset(b, new); | ||||
| 
 | ||||
| 		printk(KERN_ERR "*** on disk node:\n"); | ||||
| 		dump_bset(v, v->sets[0].data); | ||||
| 
 | ||||
| 		for (j = 0; j < new->keys; j++) | ||||
| 			if (new->d[j] != v->sets[0].data->d[j]) | ||||
| 				break; | ||||
| 
 | ||||
| 		console_unlock(); | ||||
| 		panic("verify failed at %u\n", j); | ||||
| 	} | ||||
| 
 | ||||
| 	mutex_unlock(&b->c->verify_lock); | ||||
| } | ||||
| 
 | ||||
| static void data_verify_endio(struct bio *bio, int error) | ||||
| { | ||||
| 	struct closure *cl = bio->bi_private; | ||||
| 	closure_put(cl); | ||||
| } | ||||
| 
 | ||||
| void bch_data_verify(struct search *s) | ||||
| { | ||||
| 	char name[BDEVNAME_SIZE]; | ||||
| 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | ||||
| 	struct closure *cl = &s->cl; | ||||
| 	struct bio *check; | ||||
| 	struct bio_vec *bv; | ||||
| 	int i; | ||||
| 
 | ||||
| 	if (!s->unaligned_bvec) | ||||
| 		bio_for_each_segment(bv, s->orig_bio, i) | ||||
| 			bv->bv_offset = 0, bv->bv_len = PAGE_SIZE; | ||||
| 
 | ||||
| 	check = bio_clone(s->orig_bio, GFP_NOIO); | ||||
| 	if (!check) | ||||
| 		return; | ||||
| 
 | ||||
| 	if (bio_alloc_pages(check, GFP_NOIO)) | ||||
| 		goto out_put; | ||||
| 
 | ||||
| 	check->bi_rw		= READ_SYNC; | ||||
| 	check->bi_private	= cl; | ||||
| 	check->bi_end_io	= data_verify_endio; | ||||
| 
 | ||||
| 	closure_bio_submit(check, cl, &dc->disk); | ||||
| 	closure_sync(cl); | ||||
| 
 | ||||
| 	bio_for_each_segment(bv, s->orig_bio, i) { | ||||
| 		void *p1 = kmap(bv->bv_page); | ||||
| 		void *p2 = kmap(check->bi_io_vec[i].bv_page); | ||||
| 
 | ||||
| 		if (memcmp(p1 + bv->bv_offset, | ||||
| 			   p2 + bv->bv_offset, | ||||
| 			   bv->bv_len)) | ||||
| 			printk(KERN_ERR "bcache (%s): verify failed" | ||||
| 			       " at sector %llu\n", | ||||
| 			       bdevname(dc->bdev, name), | ||||
| 			       (uint64_t) s->orig_bio->bi_sector); | ||||
| 
 | ||||
| 		kunmap(bv->bv_page); | ||||
| 		kunmap(check->bi_io_vec[i].bv_page); | ||||
| 	} | ||||
| 
 | ||||
| 	__bio_for_each_segment(bv, check, i, 0) | ||||
| 		__free_page(bv->bv_page); | ||||
| out_put: | ||||
| 	bio_put(check); | ||||
| } | ||||
| 
 | ||||
| #endif | ||||
| 
 | ||||
| #ifdef CONFIG_BCACHE_EDEBUG | ||||
| 
 | ||||
| unsigned bch_count_data(struct btree *b) | ||||
| { | ||||
| 	unsigned ret = 0; | ||||
| 	struct btree_iter iter; | ||||
| 	struct bkey *k; | ||||
| 
 | ||||
| 	if (!b->level) | ||||
| 		for_each_key(b, k, &iter) | ||||
| 			ret += KEY_SIZE(k); | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| static void vdump_bucket_and_panic(struct btree *b, const char *fmt, | ||||
| 				   va_list args) | ||||
| { | ||||
| 	unsigned i; | ||||
| 
 | ||||
| 	console_lock(); | ||||
| 
 | ||||
| 	for (i = 0; i <= b->nsets; i++) | ||||
| 		dump_bset(b, b->sets[i].data); | ||||
| 
 | ||||
| 	vprintk(fmt, args); | ||||
| 
 | ||||
| 	console_unlock(); | ||||
| 
 | ||||
| 	panic("at %s\n", pbtree(b)); | ||||
| } | ||||
| 
 | ||||
| void bch_check_key_order_msg(struct btree *b, struct bset *i, | ||||
| 			     const char *fmt, ...) | ||||
| { | ||||
| 	struct bkey *k; | ||||
| 
 | ||||
| 	if (!i->keys) | ||||
| 		return; | ||||
| 
 | ||||
| 	for (k = i->start; bkey_next(k) < end(i); k = bkey_next(k)) | ||||
| 		if (skipped_backwards(b, k)) { | ||||
| 			va_list args; | ||||
| 			va_start(args, fmt); | ||||
| 
 | ||||
| 			vdump_bucket_and_panic(b, fmt, args); | ||||
| 			va_end(args); | ||||
| 		} | ||||
| } | ||||
| 
 | ||||
| void bch_check_keys(struct btree *b, const char *fmt, ...) | ||||
| { | ||||
| 	va_list args; | ||||
| 	struct bkey *k, *p = NULL; | ||||
| 	struct btree_iter iter; | ||||
| 
 | ||||
| 	if (b->level) | ||||
| 		return; | ||||
| 
 | ||||
| 	for_each_key(b, k, &iter) { | ||||
| 		if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) { | ||||
| 			printk(KERN_ERR "Keys out of order:\n"); | ||||
| 			goto bug; | ||||
| 		} | ||||
| 
 | ||||
| 		if (bch_ptr_invalid(b, k)) | ||||
| 			continue; | ||||
| 
 | ||||
| 		if (p && bkey_cmp(p, &START_KEY(k)) > 0) { | ||||
| 			printk(KERN_ERR "Overlapping keys:\n"); | ||||
| 			goto bug; | ||||
| 		} | ||||
| 		p = k; | ||||
| 	} | ||||
| 	return; | ||||
| bug: | ||||
| 	va_start(args, fmt); | ||||
| 	vdump_bucket_and_panic(b, fmt, args); | ||||
| 	va_end(args); | ||||
| } | ||||
| 
 | ||||
| #endif | ||||
| 
 | ||||
| #ifdef CONFIG_DEBUG_FS | ||||
| 
 | ||||
| /* XXX: cache set refcounting */ | ||||
| 
 | ||||
| struct dump_iterator { | ||||
| 	char			buf[PAGE_SIZE]; | ||||
| 	size_t			bytes; | ||||
| 	struct cache_set	*c; | ||||
| 	struct keybuf		keys; | ||||
| }; | ||||
| 
 | ||||
/*
 * Keybuf predicate for the debugfs dump: accepts every key, so both
 * arguments are ignored.
 */
static bool dump_pred(struct keybuf *buf, struct bkey *k)
{
	const bool accept = true;

	return accept;
}
| 
 | ||||
| static ssize_t bch_dump_read(struct file *file, char __user *buf, | ||||
| 			     size_t size, loff_t *ppos) | ||||
| { | ||||
| 	struct dump_iterator *i = file->private_data; | ||||
| 	ssize_t ret = 0; | ||||
| 
 | ||||
| 	while (size) { | ||||
| 		struct keybuf_key *w; | ||||
| 		unsigned bytes = min(i->bytes, size); | ||||
| 
 | ||||
| 		int err = copy_to_user(buf, i->buf, bytes); | ||||
| 		if (err) | ||||
| 			return err; | ||||
| 
 | ||||
| 		ret	 += bytes; | ||||
| 		buf	 += bytes; | ||||
| 		size	 -= bytes; | ||||
| 		i->bytes -= bytes; | ||||
| 		memmove(i->buf, i->buf + bytes, i->bytes); | ||||
| 
 | ||||
| 		if (i->bytes) | ||||
| 			break; | ||||
| 
 | ||||
| 		w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY); | ||||
| 		if (!w) | ||||
| 			break; | ||||
| 
 | ||||
| 		i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", pkey(&w->key)); | ||||
| 		bch_keybuf_del(&i->keys, w); | ||||
| 	} | ||||
| 
 | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| static int bch_dump_open(struct inode *inode, struct file *file) | ||||
| { | ||||
| 	struct cache_set *c = inode->i_private; | ||||
| 	struct dump_iterator *i; | ||||
| 
 | ||||
| 	i = kzalloc(sizeof(struct dump_iterator), GFP_KERNEL); | ||||
| 	if (!i) | ||||
| 		return -ENOMEM; | ||||
| 
 | ||||
| 	file->private_data = i; | ||||
| 	i->c = c; | ||||
| 	bch_keybuf_init(&i->keys, dump_pred); | ||||
| 	i->keys.last_scanned = KEY(0, 0, 0); | ||||
| 
 | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| static int bch_dump_release(struct inode *inode, struct file *file) | ||||
| { | ||||
| 	kfree(file->private_data); | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| static const struct file_operations cache_set_debug_ops = { | ||||
| 	.owner		= THIS_MODULE, | ||||
| 	.open		= bch_dump_open, | ||||
| 	.read		= bch_dump_read, | ||||
| 	.release	= bch_dump_release | ||||
| }; | ||||
| 
 | ||||
| void bch_debug_init_cache_set(struct cache_set *c) | ||||
| { | ||||
| 	if (!IS_ERR_OR_NULL(debug)) { | ||||
| 		char name[50]; | ||||
| 		snprintf(name, 50, "bcache-%pU", c->sb.set_uuid); | ||||
| 
 | ||||
| 		c->debug = debugfs_create_file(name, 0400, debug, c, | ||||
| 					       &cache_set_debug_ops); | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| #endif | ||||
| 
 | ||||
| #ifdef CONFIG_BCACHE_DEBUG | ||||
| static ssize_t btree_fuzz(struct kobject *k, struct kobj_attribute *a, | ||||
| 			  const char *buffer, size_t size) | ||||
| { | ||||
| 	void dump(struct btree *b) | ||||
| 	{ | ||||
| 		struct bset *i; | ||||
| 
 | ||||
| 		for (i = b->sets[0].data; | ||||
| 		     index(i, b) < btree_blocks(b) && | ||||
| 		     i->seq == b->sets[0].data->seq; | ||||
| 		     i = ((void *) i) + set_blocks(i, b->c) * block_bytes(b->c)) | ||||
| 			dump_bset(b, i); | ||||
| 	} | ||||
| 
 | ||||
| 	struct cache_sb *sb; | ||||
| 	struct cache_set *c; | ||||
| 	struct btree *all[3], *b, *fill, *orig; | ||||
| 	int j; | ||||
| 
 | ||||
| 	struct btree_op op; | ||||
| 	bch_btree_op_init_stack(&op); | ||||
| 
 | ||||
| 	sb = kzalloc(sizeof(struct cache_sb), GFP_KERNEL); | ||||
| 	if (!sb) | ||||
| 		return -ENOMEM; | ||||
| 
 | ||||
| 	sb->bucket_size = 128; | ||||
| 	sb->block_size = 4; | ||||
| 
 | ||||
| 	c = bch_cache_set_alloc(sb); | ||||
| 	if (!c) | ||||
| 		return -ENOMEM; | ||||
| 
 | ||||
| 	for (j = 0; j < 3; j++) { | ||||
| 		BUG_ON(list_empty(&c->btree_cache)); | ||||
| 		all[j] = list_first_entry(&c->btree_cache, struct btree, list); | ||||
| 		list_del_init(&all[j]->list); | ||||
| 
 | ||||
| 		all[j]->key = KEY(0, 0, c->sb.bucket_size); | ||||
| 		bkey_copy_key(&all[j]->key, &MAX_KEY); | ||||
| 	} | ||||
| 
 | ||||
| 	b = all[0]; | ||||
| 	fill = all[1]; | ||||
| 	orig = all[2]; | ||||
| 
 | ||||
| 	while (1) { | ||||
| 		for (j = 0; j < 3; j++) | ||||
| 			all[j]->written = all[j]->nsets = 0; | ||||
| 
 | ||||
| 		bch_bset_init_next(b); | ||||
| 
 | ||||
| 		while (1) { | ||||
| 			struct bset *i = write_block(b); | ||||
| 			struct bkey *k = op.keys.top; | ||||
| 			unsigned rand; | ||||
| 
 | ||||
| 			bkey_init(k); | ||||
| 			rand = get_random_int(); | ||||
| 
 | ||||
| 			op.type = rand & 1 | ||||
| 				? BTREE_INSERT | ||||
| 				: BTREE_REPLACE; | ||||
| 			rand >>= 1; | ||||
| 
 | ||||
| 			SET_KEY_SIZE(k, bucket_remainder(c, rand)); | ||||
| 			rand >>= c->bucket_bits; | ||||
| 			rand &= 1024 * 512 - 1; | ||||
| 			rand += c->sb.bucket_size; | ||||
| 			SET_KEY_OFFSET(k, rand); | ||||
| #if 0 | ||||
| 			SET_KEY_PTRS(k, 1); | ||||
| #endif | ||||
| 			bch_keylist_push(&op.keys); | ||||
| 			bch_btree_insert_keys(b, &op); | ||||
| 
 | ||||
| 			if (should_split(b) || | ||||
| 			    set_blocks(i, b->c) != | ||||
| 			    __set_blocks(i, i->keys + 15, b->c)) { | ||||
| 				i->csum = csum_set(i); | ||||
| 
 | ||||
| 				memcpy(write_block(fill), | ||||
| 				       i, set_bytes(i)); | ||||
| 
 | ||||
| 				b->written += set_blocks(i, b->c); | ||||
| 				fill->written = b->written; | ||||
| 				if (b->written == btree_blocks(b)) | ||||
| 					break; | ||||
| 
 | ||||
| 				bch_btree_sort_lazy(b); | ||||
| 				bch_bset_init_next(b); | ||||
| 			} | ||||
| 		} | ||||
| 
 | ||||
| 		memcpy(orig->sets[0].data, | ||||
| 		       fill->sets[0].data, | ||||
| 		       btree_bytes(c)); | ||||
| 
 | ||||
| 		bch_btree_sort(b); | ||||
| 		fill->written = 0; | ||||
| 		bch_btree_read_done(&fill->io.cl); | ||||
| 
 | ||||
| 		if (b->sets[0].data->keys != fill->sets[0].data->keys || | ||||
| 		    memcmp(b->sets[0].data->start, | ||||
| 			   fill->sets[0].data->start, | ||||
| 			   b->sets[0].data->keys * sizeof(uint64_t))) { | ||||
| 			struct bset *i = b->sets[0].data; | ||||
| 			struct bkey *k, *l; | ||||
| 
 | ||||
| 			for (k = i->start, | ||||
| 			     l = fill->sets[0].data->start; | ||||
| 			     k < end(i); | ||||
| 			     k = bkey_next(k), l = bkey_next(l)) | ||||
| 				if (bkey_cmp(k, l) || | ||||
| 				    KEY_SIZE(k) != KEY_SIZE(l)) | ||||
| 					pr_err("key %zi differs: %s " | ||||
| 					       "!= %s", (uint64_t *) k - i->d, | ||||
| 					       pkey(k), pkey(l)); | ||||
| 
 | ||||
| 			for (j = 0; j < 3; j++) { | ||||
| 				pr_err("**** Set %i ****", j); | ||||
| 				dump(all[j]); | ||||
| 			} | ||||
| 			panic("\n"); | ||||
| 		} | ||||
| 
 | ||||
| 		pr_info("fuzz complete: %i keys", b->sets[0].data->keys); | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| kobj_attribute_write(fuzz, btree_fuzz); | ||||
| #endif | ||||
| 
 | ||||
| void bch_debug_exit(void) | ||||
| { | ||||
| 	if (!IS_ERR_OR_NULL(debug)) | ||||
| 		debugfs_remove_recursive(debug); | ||||
| } | ||||
| 
 | ||||
| int __init bch_debug_init(struct kobject *kobj) | ||||
| { | ||||
| 	int ret = 0; | ||||
| #ifdef CONFIG_BCACHE_DEBUG | ||||
| 	ret = sysfs_create_file(kobj, &ksysfs_fuzz.attr); | ||||
| 	if (ret) | ||||
| 		return ret; | ||||
| #endif | ||||
| 
 | ||||
| 	debug = debugfs_create_dir("bcache", NULL); | ||||
| 	return ret; | ||||
| } | ||||
							
								
								
									
										54
									
								
								drivers/md/bcache/debug.h
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										54
									
								
								drivers/md/bcache/debug.h
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,54 @@ | |||
| #ifndef _BCACHE_DEBUG_H | ||||
| #define _BCACHE_DEBUG_H | ||||
| 
 | ||||
| /* Btree/bkey debug printing */ | ||||
| 
 | ||||
/* Size of the text buffer returned by bch_pkey() and bch_pbtree(). */
#define KEYHACK_SIZE 80

/*
 * Fixed-size string wrapped in a struct so the formatting helpers can return
 * it by value.
 */
struct keyprint_hack {
	char s[KEYHACK_SIZE];
};

struct keyprint_hack bch_pkey(const struct bkey *k);
struct keyprint_hack bch_pbtree(const struct btree *b);

/*
 * Convenience wrappers yielding a char pointer into the returned temporary.
 * The pointer refers to a temporary object, so use it directly as a function
 * argument (e.g. for "%s") and do not store it.
 */
#define pkey(k)		(&bch_pkey(k).s[0])
#define pbtree(b)	(&bch_pbtree(b).s[0])
| 
 | ||||
/*
 * Expensive consistency checks: real functions with CONFIG_BCACHE_EDEBUG,
 * no-op macros otherwise (bch_count_data() then evaluates to 0).
 */
#ifdef CONFIG_BCACHE_EDEBUG

unsigned bch_count_data(struct btree *);
void bch_check_key_order_msg(struct btree *, struct bset *, const char *, ...);
void bch_check_keys(struct btree *, const char *, ...);

/* Key order check with a default message. */
#define bch_check_key_order(b, i)			\
	bch_check_key_order_msg(b, i, "keys out of order")
#define EBUG_ON(cond)		BUG_ON(cond)

#else /* EDEBUG */

#define bch_count_data(b)				0
#define bch_check_key_order(b, i)			do {} while (0)
#define bch_check_key_order_msg(b, i, ...)		do {} while (0)
#define bch_check_keys(b, ...)				do {} while (0)
#define EBUG_ON(cond)					do {} while (0)

#endif
| 
 | ||||
/*
 * Btree and data verification: real functions with CONFIG_BCACHE_DEBUG,
 * empty inline stubs otherwise.
 */
#ifdef CONFIG_BCACHE_DEBUG

void bch_btree_verify(struct btree *, struct bset *);
void bch_data_verify(struct search *);

#else /* DEBUG */

static inline void bch_btree_verify(struct btree *b, struct bset *i) {}
/* No ';' after the body: that would be an empty file-scope declaration. */
static inline void bch_data_verify(struct search *s) {}

#endif
| 
 | ||||
/* Per-cache-set debugfs file: only available with CONFIG_DEBUG_FS. */
#ifdef CONFIG_DEBUG_FS
void bch_debug_init_cache_set(struct cache_set *);
#else
static inline void bch_debug_init_cache_set(struct cache_set *c) {}
#endif
| 
 | ||||
| #endif | ||||
							
								
								
									
										390
									
								
								drivers/md/bcache/io.c
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										390
									
								
								drivers/md/bcache/io.c
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,390 @@ | |||
| /*
 | ||||
|  * Some low level IO code, and hacks for various block layer limitations | ||||
|  * | ||||
|  * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> | ||||
|  * Copyright 2012 Google, Inc. | ||||
|  */ | ||||
| 
 | ||||
| #include "bcache.h" | ||||
| #include "bset.h" | ||||
| #include "debug.h" | ||||
| 
 | ||||
| static void bch_bi_idx_hack_endio(struct bio *bio, int error) | ||||
| { | ||||
| 	struct bio *p = bio->bi_private; | ||||
| 
 | ||||
| 	bio_endio(p, error); | ||||
| 	bio_put(bio); | ||||
| } | ||||
| 
 | ||||
| static void bch_generic_make_request_hack(struct bio *bio) | ||||
| { | ||||
| 	if (bio->bi_idx) { | ||||
| 		struct bio *clone = bio_alloc(GFP_NOIO, bio_segments(bio)); | ||||
| 
 | ||||
| 		memcpy(clone->bi_io_vec, | ||||
| 		       bio_iovec(bio), | ||||
| 		       bio_segments(bio) * sizeof(struct bio_vec)); | ||||
| 
 | ||||
| 		clone->bi_sector	= bio->bi_sector; | ||||
| 		clone->bi_bdev		= bio->bi_bdev; | ||||
| 		clone->bi_rw		= bio->bi_rw; | ||||
| 		clone->bi_vcnt		= bio_segments(bio); | ||||
| 		clone->bi_size		= bio->bi_size; | ||||
| 
 | ||||
| 		clone->bi_private	= bio; | ||||
| 		clone->bi_end_io	= bch_bi_idx_hack_endio; | ||||
| 
 | ||||
| 		bio = clone; | ||||
| 	} | ||||
| 
 | ||||
| 	generic_make_request(bio); | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * bch_bio_split - split a bio | ||||
|  * @bio:	bio to split | ||||
|  * @sectors:	number of sectors to split from the front of @bio | ||||
|  * @gfp:	gfp mask | ||||
|  * @bs:		bio set to allocate from | ||||
|  * | ||||
|  * Allocates and returns a new bio which represents @sectors from the start of | ||||
|  * @bio, and updates @bio to represent the remaining sectors. | ||||
|  * | ||||
|  * If bio_sectors(@bio) was less than or equal to @sectors, returns @bio | ||||
|  * unchanged. | ||||
|  * | ||||
|  * The newly allocated bio will point to @bio's bi_io_vec, if the split was on a | ||||
|  * bvec boundry; it is the caller's responsibility to ensure that @bio is not | ||||
|  * freed before the split. | ||||
|  * | ||||
|  * If bch_bio_split() is running under generic_make_request(), it's not safe to | ||||
|  * allocate more than one bio from the same bio set. Therefore, if it is running | ||||
|  * under generic_make_request() it masks out __GFP_WAIT when doing the | ||||
|  * allocation. The caller must check for failure if there's any possibility of | ||||
|  * it being called from under generic_make_request(); it is then the caller's | ||||
|  * responsibility to retry from a safe context (by e.g. punting to workqueue). | ||||
|  */ | ||||
| struct bio *bch_bio_split(struct bio *bio, int sectors, | ||||
| 			  gfp_t gfp, struct bio_set *bs) | ||||
| { | ||||
| 	unsigned idx = bio->bi_idx, vcnt = 0, nbytes = sectors << 9; | ||||
| 	struct bio_vec *bv; | ||||
| 	struct bio *ret = NULL; | ||||
| 
 | ||||
| 	BUG_ON(sectors <= 0); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * If we're being called from underneath generic_make_request() and we | ||||
| 	 * already allocated any bios from this bio set, we risk deadlock if we | ||||
| 	 * use the mempool. So instead, we possibly fail and let the caller punt | ||||
| 	 * to workqueue or somesuch and retry in a safe context. | ||||
| 	 */ | ||||
| 	if (current->bio_list) | ||||
| 		gfp &= ~__GFP_WAIT; | ||||
| 
 | ||||
| 	if (sectors >= bio_sectors(bio)) | ||||
| 		return bio; | ||||
| 
 | ||||
| 	if (bio->bi_rw & REQ_DISCARD) { | ||||
| 		ret = bio_alloc_bioset(gfp, 1, bs); | ||||
| 		idx = 0; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	bio_for_each_segment(bv, bio, idx) { | ||||
| 		vcnt = idx - bio->bi_idx; | ||||
| 
 | ||||
| 		if (!nbytes) { | ||||
| 			ret = bio_alloc_bioset(gfp, vcnt, bs); | ||||
| 			if (!ret) | ||||
| 				return NULL; | ||||
| 
 | ||||
| 			memcpy(ret->bi_io_vec, bio_iovec(bio), | ||||
| 			       sizeof(struct bio_vec) * vcnt); | ||||
| 
 | ||||
| 			break; | ||||
| 		} else if (nbytes < bv->bv_len) { | ||||
| 			ret = bio_alloc_bioset(gfp, ++vcnt, bs); | ||||
| 			if (!ret) | ||||
| 				return NULL; | ||||
| 
 | ||||
| 			memcpy(ret->bi_io_vec, bio_iovec(bio), | ||||
| 			       sizeof(struct bio_vec) * vcnt); | ||||
| 
 | ||||
| 			ret->bi_io_vec[vcnt - 1].bv_len = nbytes; | ||||
| 			bv->bv_offset	+= nbytes; | ||||
| 			bv->bv_len	-= nbytes; | ||||
| 			break; | ||||
| 		} | ||||
| 
 | ||||
| 		nbytes -= bv->bv_len; | ||||
| 	} | ||||
| out: | ||||
| 	ret->bi_bdev	= bio->bi_bdev; | ||||
| 	ret->bi_sector	= bio->bi_sector; | ||||
| 	ret->bi_size	= sectors << 9; | ||||
| 	ret->bi_rw	= bio->bi_rw; | ||||
| 	ret->bi_vcnt	= vcnt; | ||||
| 	ret->bi_max_vecs = vcnt; | ||||
| 
 | ||||
| 	bio->bi_sector	+= sectors; | ||||
| 	bio->bi_size	-= sectors << 9; | ||||
| 	bio->bi_idx	 = idx; | ||||
| 
 | ||||
| 	if (bio_integrity(bio)) { | ||||
| 		if (bio_integrity_clone(ret, bio, gfp)) { | ||||
| 			bio_put(ret); | ||||
| 			return NULL; | ||||
| 		} | ||||
| 
 | ||||
| 		bio_integrity_trim(ret, 0, bio_sectors(ret)); | ||||
| 		bio_integrity_trim(bio, bio_sectors(ret), bio_sectors(bio)); | ||||
| 	} | ||||
| 
 | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| static unsigned bch_bio_max_sectors(struct bio *bio) | ||||
| { | ||||
| 	unsigned ret = bio_sectors(bio); | ||||
| 	struct request_queue *q = bdev_get_queue(bio->bi_bdev); | ||||
| 	struct bio_vec *bv, *end = bio_iovec(bio) + | ||||
| 		min_t(int, bio_segments(bio), queue_max_segments(q)); | ||||
| 
 | ||||
| 	struct bvec_merge_data bvm = { | ||||
| 		.bi_bdev	= bio->bi_bdev, | ||||
| 		.bi_sector	= bio->bi_sector, | ||||
| 		.bi_size	= 0, | ||||
| 		.bi_rw		= bio->bi_rw, | ||||
| 	}; | ||||
| 
 | ||||
| 	if (bio->bi_rw & REQ_DISCARD) | ||||
| 		return min(ret, q->limits.max_discard_sectors); | ||||
| 
 | ||||
| 	if (bio_segments(bio) > queue_max_segments(q) || | ||||
| 	    q->merge_bvec_fn) { | ||||
| 		ret = 0; | ||||
| 
 | ||||
| 		for (bv = bio_iovec(bio); bv < end; bv++) { | ||||
| 			if (q->merge_bvec_fn && | ||||
| 			    q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len) | ||||
| 				break; | ||||
| 
 | ||||
| 			ret		+= bv->bv_len >> 9; | ||||
| 			bvm.bi_size	+= bv->bv_len; | ||||
| 		} | ||||
| 
 | ||||
| 		if (ret >= (BIO_MAX_PAGES * PAGE_SIZE) >> 9) | ||||
| 			return (BIO_MAX_PAGES * PAGE_SIZE) >> 9; | ||||
| 	} | ||||
| 
 | ||||
| 	ret = min(ret, queue_max_sectors(q)); | ||||
| 
 | ||||
| 	WARN_ON(!ret); | ||||
| 	ret = max_t(int, ret, bio_iovec(bio)->bv_len >> 9); | ||||
| 
 | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| static void bch_bio_submit_split_done(struct closure *cl) | ||||
| { | ||||
| 	struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl); | ||||
| 
 | ||||
| 	s->bio->bi_end_io = s->bi_end_io; | ||||
| 	s->bio->bi_private = s->bi_private; | ||||
| 	bio_endio(s->bio, 0); | ||||
| 
 | ||||
| 	closure_debug_destroy(&s->cl); | ||||
| 	mempool_free(s, s->p->bio_split_hook); | ||||
| } | ||||
| 
 | ||||
| static void bch_bio_submit_split_endio(struct bio *bio, int error) | ||||
| { | ||||
| 	struct closure *cl = bio->bi_private; | ||||
| 	struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl); | ||||
| 
 | ||||
| 	if (error) | ||||
| 		clear_bit(BIO_UPTODATE, &s->bio->bi_flags); | ||||
| 
 | ||||
| 	bio_put(bio); | ||||
| 	closure_put(cl); | ||||
| } | ||||
| 
 | ||||
| static void __bch_bio_submit_split(struct closure *cl) | ||||
| { | ||||
| 	struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl); | ||||
| 	struct bio *bio = s->bio, *n; | ||||
| 
 | ||||
| 	do { | ||||
| 		n = bch_bio_split(bio, bch_bio_max_sectors(bio), | ||||
| 				  GFP_NOIO, s->p->bio_split); | ||||
| 		if (!n) | ||||
| 			continue_at(cl, __bch_bio_submit_split, system_wq); | ||||
| 
 | ||||
| 		n->bi_end_io	= bch_bio_submit_split_endio; | ||||
| 		n->bi_private	= cl; | ||||
| 
 | ||||
| 		closure_get(cl); | ||||
| 		bch_generic_make_request_hack(n); | ||||
| 	} while (n != bio); | ||||
| 
 | ||||
| 	continue_at(cl, bch_bio_submit_split_done, NULL); | ||||
| } | ||||
| 
 | ||||
| void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p) | ||||
| { | ||||
| 	struct bio_split_hook *s; | ||||
| 
 | ||||
| 	if (!bio_has_data(bio) && !(bio->bi_rw & REQ_DISCARD)) | ||||
| 		goto submit; | ||||
| 
 | ||||
| 	if (bio_sectors(bio) <= bch_bio_max_sectors(bio)) | ||||
| 		goto submit; | ||||
| 
 | ||||
| 	s = mempool_alloc(p->bio_split_hook, GFP_NOIO); | ||||
| 
 | ||||
| 	s->bio		= bio; | ||||
| 	s->p		= p; | ||||
| 	s->bi_end_io	= bio->bi_end_io; | ||||
| 	s->bi_private	= bio->bi_private; | ||||
| 	bio_get(bio); | ||||
| 
 | ||||
| 	closure_call(&s->cl, __bch_bio_submit_split, NULL, NULL); | ||||
| 	return; | ||||
| submit: | ||||
| 	bch_generic_make_request_hack(bio); | ||||
| } | ||||
| 
 | ||||
| /* Bios with headers */ | ||||
| 
 | ||||
| void bch_bbio_free(struct bio *bio, struct cache_set *c) | ||||
| { | ||||
| 	struct bbio *b = container_of(bio, struct bbio, bio); | ||||
| 	mempool_free(b, c->bio_meta); | ||||
| } | ||||
| 
 | ||||
| struct bio *bch_bbio_alloc(struct cache_set *c) | ||||
| { | ||||
| 	struct bbio *b = mempool_alloc(c->bio_meta, GFP_NOIO); | ||||
| 	struct bio *bio = &b->bio; | ||||
| 
 | ||||
| 	bio_init(bio); | ||||
| 	bio->bi_flags		|= BIO_POOL_NONE << BIO_POOL_OFFSET; | ||||
| 	bio->bi_max_vecs	 = bucket_pages(c); | ||||
| 	bio->bi_io_vec		 = bio->bi_inline_vecs; | ||||
| 
 | ||||
| 	return bio; | ||||
| } | ||||
| 
 | ||||
| void __bch_submit_bbio(struct bio *bio, struct cache_set *c) | ||||
| { | ||||
| 	struct bbio *b = container_of(bio, struct bbio, bio); | ||||
| 
 | ||||
| 	bio->bi_sector	= PTR_OFFSET(&b->key, 0); | ||||
| 	bio->bi_bdev	= PTR_CACHE(c, &b->key, 0)->bdev; | ||||
| 
 | ||||
| 	b->submit_time_us = local_clock_us(); | ||||
| 	closure_bio_submit(bio, bio->bi_private, PTR_CACHE(c, &b->key, 0)); | ||||
| } | ||||
| 
 | ||||
| void bch_submit_bbio(struct bio *bio, struct cache_set *c, | ||||
| 		     struct bkey *k, unsigned ptr) | ||||
| { | ||||
| 	struct bbio *b = container_of(bio, struct bbio, bio); | ||||
| 	bch_bkey_copy_single_ptr(&b->key, k, ptr); | ||||
| 	__bch_submit_bbio(bio, c); | ||||
| } | ||||
| 
 | ||||
| /* IO errors */ | ||||
| 
 | ||||
| void bch_count_io_errors(struct cache *ca, int error, const char *m) | ||||
| { | ||||
| 	/*
 | ||||
| 	 * The halflife of an error is: | ||||
| 	 * log2(1/2)/log2(127/128) * refresh ~= 88 * refresh | ||||
| 	 */ | ||||
| 
 | ||||
| 	if (ca->set->error_decay) { | ||||
| 		unsigned count = atomic_inc_return(&ca->io_count); | ||||
| 
 | ||||
| 		while (count > ca->set->error_decay) { | ||||
| 			unsigned errors; | ||||
| 			unsigned old = count; | ||||
| 			unsigned new = count - ca->set->error_decay; | ||||
| 
 | ||||
| 			/*
 | ||||
| 			 * First we subtract refresh from count; each time we | ||||
| 			 * succesfully do so, we rescale the errors once: | ||||
| 			 */ | ||||
| 
 | ||||
| 			count = atomic_cmpxchg(&ca->io_count, old, new); | ||||
| 
 | ||||
| 			if (count == old) { | ||||
| 				count = new; | ||||
| 
 | ||||
| 				errors = atomic_read(&ca->io_errors); | ||||
| 				do { | ||||
| 					old = errors; | ||||
| 					new = ((uint64_t) errors * 127) / 128; | ||||
| 					errors = atomic_cmpxchg(&ca->io_errors, | ||||
| 								old, new); | ||||
| 				} while (old != errors); | ||||
| 			} | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	if (error) { | ||||
| 		char buf[BDEVNAME_SIZE]; | ||||
| 		unsigned errors = atomic_add_return(1 << IO_ERROR_SHIFT, | ||||
| 						    &ca->io_errors); | ||||
| 		errors >>= IO_ERROR_SHIFT; | ||||
| 
 | ||||
| 		if (errors < ca->set->error_limit) | ||||
| 			pr_err("%s: IO error on %s, recovering", | ||||
| 			       bdevname(ca->bdev, buf), m); | ||||
| 		else | ||||
| 			bch_cache_set_error(ca->set, | ||||
| 					    "%s: too many IO errors %s", | ||||
| 					    bdevname(ca->bdev, buf), m); | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio, | ||||
| 			      int error, const char *m) | ||||
| { | ||||
| 	struct bbio *b = container_of(bio, struct bbio, bio); | ||||
| 	struct cache *ca = PTR_CACHE(c, &b->key, 0); | ||||
| 
 | ||||
| 	unsigned threshold = bio->bi_rw & REQ_WRITE | ||||
| 		? c->congested_write_threshold_us | ||||
| 		: c->congested_read_threshold_us; | ||||
| 
 | ||||
| 	if (threshold) { | ||||
| 		unsigned t = local_clock_us(); | ||||
| 
 | ||||
| 		int us = t - b->submit_time_us; | ||||
| 		int congested = atomic_read(&c->congested); | ||||
| 
 | ||||
| 		if (us > (int) threshold) { | ||||
| 			int ms = us / 1024; | ||||
| 			c->congested_last_us = t; | ||||
| 
 | ||||
| 			ms = min(ms, CONGESTED_MAX + congested); | ||||
| 			atomic_sub(ms, &c->congested); | ||||
| 		} else if (congested < 0) | ||||
| 			atomic_inc(&c->congested); | ||||
| 	} | ||||
| 
 | ||||
| 	bch_count_io_errors(ca, error, m); | ||||
| } | ||||
| 
 | ||||
| void bch_bbio_endio(struct cache_set *c, struct bio *bio, | ||||
| 		    int error, const char *m) | ||||
| { | ||||
| 	struct closure *cl = bio->bi_private; | ||||
| 
 | ||||
| 	bch_bbio_count_io_errors(c, bio, error, m); | ||||
| 	bio_put(bio); | ||||
| 	closure_put(cl); | ||||
| } | ||||
							
								
								
									
										785
									
								
								drivers/md/bcache/journal.c
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										785
									
								
								drivers/md/bcache/journal.c
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,785 @@ | |||
| /*
 | ||||
|  * bcache journalling code, for btree insertions | ||||
|  * | ||||
|  * Copyright 2012 Google, Inc. | ||||
|  */ | ||||
| 
 | ||||
| #include "bcache.h" | ||||
| #include "btree.h" | ||||
| #include "debug.h" | ||||
| #include "request.h" | ||||
| 
 | ||||
| /*
 | ||||
|  * Journal replay/recovery: | ||||
|  * | ||||
|  * This code is all driven from run_cache_set(); we first read the journal | ||||
|  * entries, do some other stuff, then we mark all the keys in the journal | ||||
|  * entries (same as garbage collection would), then we replay them - reinserting | ||||
|  * them into the cache in precisely the same order as they appear in the | ||||
|  * journal. | ||||
|  * | ||||
|  * We only journal keys that go in leaf nodes, which simplifies things quite a | ||||
|  * bit. | ||||
|  */ | ||||
| 
 | ||||
| static void journal_read_endio(struct bio *bio, int error) | ||||
| { | ||||
| 	struct closure *cl = bio->bi_private; | ||||
| 	closure_put(cl); | ||||
| } | ||||
| 
 | ||||
| static int journal_read_bucket(struct cache *ca, struct list_head *list, | ||||
| 			       struct btree_op *op, unsigned bucket_index) | ||||
| { | ||||
| 	struct journal_device *ja = &ca->journal; | ||||
| 	struct bio *bio = &ja->bio; | ||||
| 
 | ||||
| 	struct journal_replay *i; | ||||
| 	struct jset *j, *data = ca->set->journal.w[0].data; | ||||
| 	unsigned len, left, offset = 0; | ||||
| 	int ret = 0; | ||||
| 	sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]); | ||||
| 
 | ||||
| 	pr_debug("reading %llu", (uint64_t) bucket); | ||||
| 
 | ||||
| 	while (offset < ca->sb.bucket_size) { | ||||
| reread:		left = ca->sb.bucket_size - offset; | ||||
| 		len = min_t(unsigned, left, PAGE_SECTORS * 8); | ||||
| 
 | ||||
| 		bio_reset(bio); | ||||
| 		bio->bi_sector	= bucket + offset; | ||||
| 		bio->bi_bdev	= ca->bdev; | ||||
| 		bio->bi_rw	= READ; | ||||
| 		bio->bi_size	= len << 9; | ||||
| 
 | ||||
| 		bio->bi_end_io	= journal_read_endio; | ||||
| 		bio->bi_private = &op->cl; | ||||
| 		bio_map(bio, data); | ||||
| 
 | ||||
| 		closure_bio_submit(bio, &op->cl, ca); | ||||
| 		closure_sync(&op->cl); | ||||
| 
 | ||||
| 		/* This function could be simpler now since we no longer write
 | ||||
| 		 * journal entries that overlap bucket boundaries; this means | ||||
| 		 * the start of a bucket will always have a valid journal entry | ||||
| 		 * if it has any journal entries at all. | ||||
| 		 */ | ||||
| 
 | ||||
| 		j = data; | ||||
| 		while (len) { | ||||
| 			struct list_head *where; | ||||
| 			size_t blocks, bytes = set_bytes(j); | ||||
| 
 | ||||
| 			if (j->magic != jset_magic(ca->set)) | ||||
| 				return ret; | ||||
| 
 | ||||
| 			if (bytes > left << 9) | ||||
| 				return ret; | ||||
| 
 | ||||
| 			if (bytes > len << 9) | ||||
| 				goto reread; | ||||
| 
 | ||||
| 			if (j->csum != csum_set(j)) | ||||
| 				return ret; | ||||
| 
 | ||||
| 			blocks = set_blocks(j, ca->set); | ||||
| 
 | ||||
| 			while (!list_empty(list)) { | ||||
| 				i = list_first_entry(list, | ||||
| 					struct journal_replay, list); | ||||
| 				if (i->j.seq >= j->last_seq) | ||||
| 					break; | ||||
| 				list_del(&i->list); | ||||
| 				kfree(i); | ||||
| 			} | ||||
| 
 | ||||
| 			list_for_each_entry_reverse(i, list, list) { | ||||
| 				if (j->seq == i->j.seq) | ||||
| 					goto next_set; | ||||
| 
 | ||||
| 				if (j->seq < i->j.last_seq) | ||||
| 					goto next_set; | ||||
| 
 | ||||
| 				if (j->seq > i->j.seq) { | ||||
| 					where = &i->list; | ||||
| 					goto add; | ||||
| 				} | ||||
| 			} | ||||
| 
 | ||||
| 			where = list; | ||||
| add: | ||||
| 			i = kmalloc(offsetof(struct journal_replay, j) + | ||||
| 				    bytes, GFP_KERNEL); | ||||
| 			if (!i) | ||||
| 				return -ENOMEM; | ||||
| 			memcpy(&i->j, j, bytes); | ||||
| 			list_add(&i->list, where); | ||||
| 			ret = 1; | ||||
| 
 | ||||
| 			ja->seq[bucket_index] = j->seq; | ||||
| next_set: | ||||
| 			offset	+= blocks * ca->sb.block_size; | ||||
| 			len	-= blocks * ca->sb.block_size; | ||||
| 			j = ((void *) j) + blocks * block_bytes(ca); | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| int bch_journal_read(struct cache_set *c, struct list_head *list, | ||||
| 			struct btree_op *op) | ||||
| { | ||||
| #define read_bucket(b)							\ | ||||
| 	({								\ | ||||
| 		int ret = journal_read_bucket(ca, list, op, b);		\ | ||||
| 		__set_bit(b, bitmap);					\ | ||||
| 		if (ret < 0)						\ | ||||
| 			return ret;					\ | ||||
| 		ret;							\ | ||||
| 	}) | ||||
| 
 | ||||
| 	struct cache *ca; | ||||
| 	unsigned iter; | ||||
| 
 | ||||
| 	for_each_cache(ca, c, iter) { | ||||
| 		struct journal_device *ja = &ca->journal; | ||||
| 		unsigned long bitmap[SB_JOURNAL_BUCKETS / BITS_PER_LONG]; | ||||
| 		unsigned i, l, r, m; | ||||
| 		uint64_t seq; | ||||
| 
 | ||||
| 		bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); | ||||
| 		pr_debug("%u journal buckets", ca->sb.njournal_buckets); | ||||
| 
 | ||||
| 		/* Read journal buckets ordered by golden ratio hash to quickly
 | ||||
| 		 * find a sequence of buckets with valid journal entries | ||||
| 		 */ | ||||
| 		for (i = 0; i < ca->sb.njournal_buckets; i++) { | ||||
| 			l = (i * 2654435769U) % ca->sb.njournal_buckets; | ||||
| 
 | ||||
| 			if (test_bit(l, bitmap)) | ||||
| 				break; | ||||
| 
 | ||||
| 			if (read_bucket(l)) | ||||
| 				goto bsearch; | ||||
| 		} | ||||
| 
 | ||||
| 		/* If that fails, check all the buckets we haven't checked
 | ||||
| 		 * already | ||||
| 		 */ | ||||
| 		pr_debug("falling back to linear search"); | ||||
| 
 | ||||
| 		for (l = 0; l < ca->sb.njournal_buckets; l++) { | ||||
| 			if (test_bit(l, bitmap)) | ||||
| 				continue; | ||||
| 
 | ||||
| 			if (read_bucket(l)) | ||||
| 				goto bsearch; | ||||
| 		} | ||||
| bsearch: | ||||
| 		/* Binary search */ | ||||
| 		m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1); | ||||
| 		pr_debug("starting binary search, l %u r %u", l, r); | ||||
| 
 | ||||
| 		while (l + 1 < r) { | ||||
| 			m = (l + r) >> 1; | ||||
| 
 | ||||
| 			if (read_bucket(m)) | ||||
| 				l = m; | ||||
| 			else | ||||
| 				r = m; | ||||
| 		} | ||||
| 
 | ||||
| 		/* Read buckets in reverse order until we stop finding more
 | ||||
| 		 * journal entries | ||||
| 		 */ | ||||
| 		pr_debug("finishing up"); | ||||
| 		l = m; | ||||
| 
 | ||||
| 		while (1) { | ||||
| 			if (!l--) | ||||
| 				l = ca->sb.njournal_buckets - 1; | ||||
| 
 | ||||
| 			if (l == m) | ||||
| 				break; | ||||
| 
 | ||||
| 			if (test_bit(l, bitmap)) | ||||
| 				continue; | ||||
| 
 | ||||
| 			if (!read_bucket(l)) | ||||
| 				break; | ||||
| 		} | ||||
| 
 | ||||
| 		seq = 0; | ||||
| 
 | ||||
| 		for (i = 0; i < ca->sb.njournal_buckets; i++) | ||||
| 			if (ja->seq[i] > seq) { | ||||
| 				seq = ja->seq[i]; | ||||
| 				ja->cur_idx = ja->discard_idx = | ||||
| 					ja->last_idx = i; | ||||
| 
 | ||||
| 			} | ||||
| 	} | ||||
| 
 | ||||
| 	c->journal.seq = list_entry(list->prev, | ||||
| 				    struct journal_replay, | ||||
| 				    list)->j.seq; | ||||
| 
 | ||||
| 	return 0; | ||||
| #undef read_bucket | ||||
| } | ||||
| 
 | ||||
| void bch_journal_mark(struct cache_set *c, struct list_head *list) | ||||
| { | ||||
| 	atomic_t p = { 0 }; | ||||
| 	struct bkey *k; | ||||
| 	struct journal_replay *i; | ||||
| 	struct journal *j = &c->journal; | ||||
| 	uint64_t last = j->seq; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * journal.pin should never fill up - we never write a journal | ||||
| 	 * entry when it would fill up. But if for some reason it does, we | ||||
| 	 * iterate over the list in reverse order so that we can just skip that | ||||
| 	 * refcount instead of bugging. | ||||
| 	 */ | ||||
| 
 | ||||
| 	list_for_each_entry_reverse(i, list, list) { | ||||
| 		BUG_ON(last < i->j.seq); | ||||
| 		i->pin = NULL; | ||||
| 
 | ||||
| 		while (last-- != i->j.seq) | ||||
| 			if (fifo_free(&j->pin) > 1) { | ||||
| 				fifo_push_front(&j->pin, p); | ||||
| 				atomic_set(&fifo_front(&j->pin), 0); | ||||
| 			} | ||||
| 
 | ||||
| 		if (fifo_free(&j->pin) > 1) { | ||||
| 			fifo_push_front(&j->pin, p); | ||||
| 			i->pin = &fifo_front(&j->pin); | ||||
| 			atomic_set(i->pin, 1); | ||||
| 		} | ||||
| 
 | ||||
| 		for (k = i->j.start; | ||||
| 		     k < end(&i->j); | ||||
| 		     k = bkey_next(k)) { | ||||
| 			unsigned j; | ||||
| 
 | ||||
| 			for (j = 0; j < KEY_PTRS(k); j++) { | ||||
| 				struct bucket *g = PTR_BUCKET(c, k, j); | ||||
| 				atomic_inc(&g->pin); | ||||
| 
 | ||||
| 				if (g->prio == BTREE_PRIO && | ||||
| 				    !ptr_stale(c, k, j)) | ||||
| 					g->prio = INITIAL_PRIO; | ||||
| 			} | ||||
| 
 | ||||
| 			__bch_btree_mark_key(c, 0, k); | ||||
| 		} | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| int bch_journal_replay(struct cache_set *s, struct list_head *list, | ||||
| 			  struct btree_op *op) | ||||
| { | ||||
| 	int ret = 0, keys = 0, entries = 0; | ||||
| 	struct bkey *k; | ||||
| 	struct journal_replay *i = | ||||
| 		list_entry(list->prev, struct journal_replay, list); | ||||
| 
 | ||||
| 	uint64_t start = i->j.last_seq, end = i->j.seq, n = start; | ||||
| 
 | ||||
| 	list_for_each_entry(i, list, list) { | ||||
| 		BUG_ON(i->pin && atomic_read(i->pin) != 1); | ||||
| 
 | ||||
| 		if (n != i->j.seq) | ||||
| 			pr_err("journal entries %llu-%llu " | ||||
| 			       "missing! (replaying %llu-%llu)\n", | ||||
| 			       n, i->j.seq - 1, start, end); | ||||
| 
 | ||||
| 		for (k = i->j.start; | ||||
| 		     k < end(&i->j); | ||||
| 		     k = bkey_next(k)) { | ||||
| 			pr_debug("%s", pkey(k)); | ||||
| 			bkey_copy(op->keys.top, k); | ||||
| 			bch_keylist_push(&op->keys); | ||||
| 
 | ||||
| 			op->journal = i->pin; | ||||
| 			atomic_inc(op->journal); | ||||
| 
 | ||||
| 			ret = bch_btree_insert(op, s); | ||||
| 			if (ret) | ||||
| 				goto err; | ||||
| 
 | ||||
| 			BUG_ON(!bch_keylist_empty(&op->keys)); | ||||
| 			keys++; | ||||
| 
 | ||||
| 			cond_resched(); | ||||
| 		} | ||||
| 
 | ||||
| 		if (i->pin) | ||||
| 			atomic_dec(i->pin); | ||||
| 		n = i->j.seq + 1; | ||||
| 		entries++; | ||||
| 	} | ||||
| 
 | ||||
| 	pr_info("journal replay done, %i keys in %i entries, seq %llu", | ||||
| 		keys, entries, end); | ||||
| 
 | ||||
| 	while (!list_empty(list)) { | ||||
| 		i = list_first_entry(list, struct journal_replay, list); | ||||
| 		list_del(&i->list); | ||||
| 		kfree(i); | ||||
| 	} | ||||
| err: | ||||
| 	closure_sync(&op->cl); | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| /* Journalling */ | ||||
| 
 | ||||
| static void btree_flush_write(struct cache_set *c) | ||||
| { | ||||
| 	/*
 | ||||
| 	 * Try to find the btree node with that references the oldest journal | ||||
| 	 * entry, best is our current candidate and is locked if non NULL: | ||||
| 	 */ | ||||
| 	struct btree *b, *best = NULL; | ||||
| 	unsigned iter; | ||||
| 
 | ||||
| 	for_each_cached_btree(b, c, iter) { | ||||
| 		if (!down_write_trylock(&b->lock)) | ||||
| 			continue; | ||||
| 
 | ||||
| 		if (!btree_node_dirty(b) || | ||||
| 		    !btree_current_write(b)->journal) { | ||||
| 			rw_unlock(true, b); | ||||
| 			continue; | ||||
| 		} | ||||
| 
 | ||||
| 		if (!best) | ||||
| 			best = b; | ||||
| 		else if (journal_pin_cmp(c, | ||||
| 					 btree_current_write(best), | ||||
| 					 btree_current_write(b))) { | ||||
| 			rw_unlock(true, best); | ||||
| 			best = b; | ||||
| 		} else | ||||
| 			rw_unlock(true, b); | ||||
| 	} | ||||
| 
 | ||||
| 	if (best) | ||||
| 		goto out; | ||||
| 
 | ||||
| 	/* We can't find the best btree node, just pick the first */ | ||||
| 	list_for_each_entry(b, &c->btree_cache, list) | ||||
| 		if (!b->level && btree_node_dirty(b)) { | ||||
| 			best = b; | ||||
| 			rw_lock(true, best, best->level); | ||||
| 			goto found; | ||||
| 		} | ||||
| 
 | ||||
| out: | ||||
| 	if (!best) | ||||
| 		return; | ||||
| found: | ||||
| 	if (btree_node_dirty(best)) | ||||
| 		bch_btree_write(best, true, NULL); | ||||
| 	rw_unlock(true, best); | ||||
| } | ||||
| 
 | ||||
| #define last_seq(j)	((j)->seq - fifo_used(&(j)->pin) + 1)	/* seq of the oldest entry that still has a slot in the pin fifo */ | ||||
| 
 | ||||
| static void journal_discard_endio(struct bio *bio, int error) | ||||
| { | ||||
| 	struct journal_device *ja = | ||||
| 		container_of(bio, struct journal_device, discard_bio); | ||||
| 	struct cache *ca = container_of(ja, struct cache, journal); | ||||
| 
 | ||||
| 	atomic_set(&ja->discard_in_flight, DISCARD_DONE); | ||||
| 
 | ||||
| 	closure_wake_up(&ca->set->journal.wait); | ||||
| 	closure_put(&ca->set->cl); | ||||
| } | ||||
| 
 | ||||
| static void journal_discard_work(struct work_struct *work) | ||||
| { | ||||
| 	struct journal_device *ja = | ||||
| 		container_of(work, struct journal_device, discard_work); | ||||
| 
 | ||||
| 	submit_bio(0, &ja->discard_bio); | ||||
| } | ||||
| 
 | ||||
| static void do_journal_discard(struct cache *ca) | ||||
| { | ||||
| 	struct journal_device *ja = &ca->journal; | ||||
| 	struct bio *bio = &ja->discard_bio; | ||||
| 
 | ||||
| 	if (!ca->discard) { | ||||
| 		ja->discard_idx = ja->last_idx; | ||||
| 		return; | ||||
| 	} | ||||
| 
 | ||||
| 	switch (atomic_read(&ja->discard_in_flight) == DISCARD_IN_FLIGHT) { | ||||
| 	case DISCARD_IN_FLIGHT: | ||||
| 		return; | ||||
| 
 | ||||
| 	case DISCARD_DONE: | ||||
| 		ja->discard_idx = (ja->discard_idx + 1) % | ||||
| 			ca->sb.njournal_buckets; | ||||
| 
 | ||||
| 		atomic_set(&ja->discard_in_flight, DISCARD_READY); | ||||
| 		/* fallthrough */ | ||||
| 
 | ||||
| 	case DISCARD_READY: | ||||
| 		if (ja->discard_idx == ja->last_idx) | ||||
| 			return; | ||||
| 
 | ||||
| 		atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT); | ||||
| 
 | ||||
| 		bio_init(bio); | ||||
| 		bio->bi_sector		= bucket_to_sector(ca->set, | ||||
| 							   ca->sb.d[ja->discard_idx]); | ||||
| 		bio->bi_bdev		= ca->bdev; | ||||
| 		bio->bi_rw		= REQ_WRITE|REQ_DISCARD; | ||||
| 		bio->bi_max_vecs	= 1; | ||||
| 		bio->bi_io_vec		= bio->bi_inline_vecs; | ||||
| 		bio->bi_size		= bucket_bytes(ca); | ||||
| 		bio->bi_end_io		= journal_discard_endio; | ||||
| 
 | ||||
| 		closure_get(&ca->set->cl); | ||||
| 		INIT_WORK(&ja->discard_work, journal_discard_work); | ||||
| 		schedule_work(&ja->discard_work); | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| static void journal_reclaim(struct cache_set *c) | ||||
| { | ||||
| 	struct bkey *k = &c->journal.key; | ||||
| 	struct cache *ca; | ||||
| 	uint64_t last_seq; | ||||
| 	unsigned iter, n = 0; | ||||
| 	atomic_t p; | ||||
| 
 | ||||
| 	while (!atomic_read(&fifo_front(&c->journal.pin))) | ||||
| 		fifo_pop(&c->journal.pin, p); | ||||
| 
 | ||||
| 	last_seq = last_seq(&c->journal); | ||||
| 
 | ||||
| 	/* Update last_idx */ | ||||
| 
 | ||||
| 	for_each_cache(ca, c, iter) { | ||||
| 		struct journal_device *ja = &ca->journal; | ||||
| 
 | ||||
| 		while (ja->last_idx != ja->cur_idx && | ||||
| 		       ja->seq[ja->last_idx] < last_seq) | ||||
| 			ja->last_idx = (ja->last_idx + 1) % | ||||
| 				ca->sb.njournal_buckets; | ||||
| 	} | ||||
| 
 | ||||
| 	for_each_cache(ca, c, iter) | ||||
| 		do_journal_discard(ca); | ||||
| 
 | ||||
| 	if (c->journal.blocks_free) | ||||
| 		return; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Allocate: | ||||
| 	 * XXX: Sort by free journal space | ||||
| 	 */ | ||||
| 
 | ||||
| 	for_each_cache(ca, c, iter) { | ||||
| 		struct journal_device *ja = &ca->journal; | ||||
| 		unsigned next = (ja->cur_idx + 1) % ca->sb.njournal_buckets; | ||||
| 
 | ||||
| 		/* No space available on this device */ | ||||
| 		if (next == ja->discard_idx) | ||||
| 			continue; | ||||
| 
 | ||||
| 		ja->cur_idx = next; | ||||
| 		k->ptr[n++] = PTR(0, | ||||
| 				  bucket_to_sector(c, ca->sb.d[ja->cur_idx]), | ||||
| 				  ca->sb.nr_this_dev); | ||||
| 	} | ||||
| 
 | ||||
| 	bkey_init(k); | ||||
| 	SET_KEY_PTRS(k, n); | ||||
| 
 | ||||
| 	if (n) | ||||
| 		c->journal.blocks_free = c->sb.bucket_size >> c->block_bits; | ||||
| 
 | ||||
| 	if (!journal_full(&c->journal)) | ||||
| 		__closure_wake_up(&c->journal.wait); | ||||
| } | ||||
| 
 | ||||
| void bch_journal_next(struct journal *j) | ||||
| { | ||||
| 	atomic_t p = { 1 }; | ||||
| 
 | ||||
| 	j->cur = (j->cur == j->w) | ||||
| 		? &j->w[1] | ||||
| 		: &j->w[0]; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * The fifo_push() needs to happen at the same time as j->seq is | ||||
| 	 * incremented for last_seq() to be calculated correctly | ||||
| 	 */ | ||||
| 	BUG_ON(!fifo_push(&j->pin, p)); | ||||
| 	atomic_set(&fifo_back(&j->pin), 1); | ||||
| 
 | ||||
| 	j->cur->data->seq	= ++j->seq; | ||||
| 	j->cur->need_write	= false; | ||||
| 	j->cur->data->keys	= 0; | ||||
| 
 | ||||
| 	if (fifo_full(&j->pin)) | ||||
| 		pr_debug("journal_pin full (%zu)", fifo_used(&j->pin)); | ||||
| } | ||||
| 
 | ||||
| static void journal_write_endio(struct bio *bio, int error) | ||||
| { | ||||
| 	struct journal_write *w = bio->bi_private; | ||||
| 
 | ||||
| 	cache_set_err_on(error, w->c, "journal io error"); | ||||
| 	closure_put(&w->c->journal.io.cl); | ||||
| } | ||||
| 
 | ||||
| static void journal_write(struct closure *); | ||||
| 
 | ||||
| static void journal_write_done(struct closure *cl) | ||||
| { | ||||
| 	struct journal *j = container_of(cl, struct journal, io.cl); | ||||
| 	struct cache_set *c = container_of(j, struct cache_set, journal); | ||||
| 
 | ||||
| 	struct journal_write *w = (j->cur == j->w) | ||||
| 		? &j->w[1] | ||||
| 		: &j->w[0]; | ||||
| 
 | ||||
| 	__closure_wake_up(&w->wait); | ||||
| 
 | ||||
| 	if (c->journal_delay_ms) | ||||
| 		closure_delay(&j->io, msecs_to_jiffies(c->journal_delay_ms)); | ||||
| 
 | ||||
| 	continue_at(cl, journal_write, system_wq); | ||||
| } | ||||
| 
 | ||||
| static void journal_write_unlocked(struct closure *cl) | ||||
| { | ||||
| 	struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl); | ||||
| 	struct cache *ca; | ||||
| 	struct journal_write *w = c->journal.cur; | ||||
| 	struct bkey *k = &c->journal.key; | ||||
| 	unsigned i, sectors = set_blocks(w->data, c) * c->sb.block_size; | ||||
| 
 | ||||
| 	struct bio *bio; | ||||
| 	struct bio_list list; | ||||
| 	bio_list_init(&list); | ||||
| 
 | ||||
| 	if (!w->need_write) { | ||||
| 		/*
 | ||||
| 		 * XXX: have to unlock closure before we unlock journal lock, | ||||
| 		 * else we race with bch_journal(). But this way we race | ||||
| 		 * against cache set unregister. Doh. | ||||
| 		 */ | ||||
| 		set_closure_fn(cl, NULL, NULL); | ||||
| 		closure_sub(cl, CLOSURE_RUNNING + 1); | ||||
| 		spin_unlock(&c->journal.lock); | ||||
| 		return; | ||||
| 	} else if (journal_full(&c->journal)) { | ||||
| 		journal_reclaim(c); | ||||
| 		spin_unlock(&c->journal.lock); | ||||
| 
 | ||||
| 		btree_flush_write(c); | ||||
| 		continue_at(cl, journal_write, system_wq); | ||||
| 	} | ||||
| 
 | ||||
| 	c->journal.blocks_free -= set_blocks(w->data, c); | ||||
| 
 | ||||
| 	w->data->btree_level = c->root->level; | ||||
| 
 | ||||
| 	bkey_copy(&w->data->btree_root, &c->root->key); | ||||
| 	bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket); | ||||
| 
 | ||||
| 	for_each_cache(ca, c, i) | ||||
| 		w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; | ||||
| 
 | ||||
| 	w->data->magic		= jset_magic(c); | ||||
| 	w->data->version	= BCACHE_JSET_VERSION; | ||||
| 	w->data->last_seq	= last_seq(&c->journal); | ||||
| 	w->data->csum		= csum_set(w->data); | ||||
| 
 | ||||
| 	for (i = 0; i < KEY_PTRS(k); i++) { | ||||
| 		ca = PTR_CACHE(c, k, i); | ||||
| 		bio = &ca->journal.bio; | ||||
| 
 | ||||
| 		atomic_long_add(sectors, &ca->meta_sectors_written); | ||||
| 
 | ||||
| 		bio_reset(bio); | ||||
| 		bio->bi_sector	= PTR_OFFSET(k, i); | ||||
| 		bio->bi_bdev	= ca->bdev; | ||||
| 		bio->bi_rw	= REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH; | ||||
| 		bio->bi_size	= sectors << 9; | ||||
| 
 | ||||
| 		bio->bi_end_io	= journal_write_endio; | ||||
| 		bio->bi_private = w; | ||||
| 		bio_map(bio, w->data); | ||||
| 
 | ||||
| 		trace_bcache_journal_write(bio); | ||||
| 		bio_list_add(&list, bio); | ||||
| 
 | ||||
| 		SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors); | ||||
| 
 | ||||
| 		ca->journal.seq[ca->journal.cur_idx] = w->data->seq; | ||||
| 	} | ||||
| 
 | ||||
| 	atomic_dec_bug(&fifo_back(&c->journal.pin)); | ||||
| 	bch_journal_next(&c->journal); | ||||
| 	journal_reclaim(c); | ||||
| 
 | ||||
| 	spin_unlock(&c->journal.lock); | ||||
| 
 | ||||
| 	while ((bio = bio_list_pop(&list))) | ||||
| 		closure_bio_submit(bio, cl, c->cache[0]); | ||||
| 
 | ||||
| 	continue_at(cl, journal_write_done, NULL); | ||||
| } | ||||
| 
 | ||||
| static void journal_write(struct closure *cl) | ||||
| { | ||||
| 	struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl); | ||||
| 
 | ||||
| 	spin_lock(&c->journal.lock); | ||||
| 	journal_write_unlocked(cl); | ||||
| } | ||||
| 
 | ||||
| static void __journal_try_write(struct cache_set *c, bool noflush) | ||||
| { | ||||
| 	struct closure *cl = &c->journal.io.cl; | ||||
| 
 | ||||
| 	if (!closure_trylock(cl, &c->cl)) | ||||
| 		spin_unlock(&c->journal.lock); | ||||
| 	else if (noflush && journal_full(&c->journal)) { | ||||
| 		spin_unlock(&c->journal.lock); | ||||
| 		continue_at(cl, journal_write, system_wq); | ||||
| 	} else | ||||
| 		journal_write_unlocked(cl); | ||||
| } | ||||
| 
 | ||||
| #define journal_try_write(c)	__journal_try_write(c, false)	/* noflush == false: the write is issued inline even when the journal is full */ | ||||
| 
 | ||||
| void bch_journal_meta(struct cache_set *c, struct closure *cl) | ||||
| { | ||||
| 	struct journal_write *w; | ||||
| 
 | ||||
| 	if (CACHE_SYNC(&c->sb)) { | ||||
| 		spin_lock(&c->journal.lock); | ||||
| 
 | ||||
| 		w = c->journal.cur; | ||||
| 		w->need_write = true; | ||||
| 
 | ||||
| 		if (cl) | ||||
| 			BUG_ON(!closure_wait(&w->wait, cl)); | ||||
| 
 | ||||
| 		__journal_try_write(c, true); | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Entry point to the journalling code - bio_insert() and btree_invalidate() | ||||
|  * pass bch_journal() a list of keys to be journalled, and then | ||||
|  * bch_journal() hands those same keys off to bch_btree_insert_async() | ||||
|  */ | ||||
| 
 | ||||
| void bch_journal(struct closure *cl) | ||||
| { | ||||
| 	struct btree_op *op = container_of(cl, struct btree_op, cl); | ||||
| 	struct cache_set *c = op->c; | ||||
| 	struct journal_write *w; | ||||
| 	size_t b, n = ((uint64_t *) op->keys.top) - op->keys.list; | ||||
| 
 | ||||
| 	if (op->type != BTREE_INSERT || | ||||
| 	    !CACHE_SYNC(&c->sb)) | ||||
| 		goto out; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * If we're looping because we errored, might already be waiting on | ||||
| 	 * another journal write: | ||||
| 	 */ | ||||
| 	while (atomic_read(&cl->parent->remaining) & CLOSURE_WAITING) | ||||
| 		closure_sync(cl->parent); | ||||
| 
 | ||||
| 	spin_lock(&c->journal.lock); | ||||
| 
 | ||||
| 	if (journal_full(&c->journal)) { | ||||
| 		/* XXX: tracepoint */ | ||||
| 		closure_wait(&c->journal.wait, cl); | ||||
| 
 | ||||
| 		journal_reclaim(c); | ||||
| 		spin_unlock(&c->journal.lock); | ||||
| 
 | ||||
| 		btree_flush_write(c); | ||||
| 		continue_at(cl, bch_journal, bcache_wq); | ||||
| 	} | ||||
| 
 | ||||
| 	w = c->journal.cur; | ||||
| 	w->need_write = true; | ||||
| 	b = __set_blocks(w->data, w->data->keys + n, c); | ||||
| 
 | ||||
| 	if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS || | ||||
| 	    b > c->journal.blocks_free) { | ||||
| 		/* XXX: If we were inserting so many keys that they won't fit in
 | ||||
| 		 * an _empty_ journal write, we'll deadlock. For now, handle | ||||
| 		 * this in bch_keylist_realloc() - but something to think about. | ||||
| 		 */ | ||||
| 		BUG_ON(!w->data->keys); | ||||
| 
 | ||||
| 		/* XXX: tracepoint */ | ||||
| 		BUG_ON(!closure_wait(&w->wait, cl)); | ||||
| 
 | ||||
| 		closure_flush(&c->journal.io); | ||||
| 
 | ||||
| 		journal_try_write(c); | ||||
| 		continue_at(cl, bch_journal, bcache_wq); | ||||
| 	} | ||||
| 
 | ||||
| 	memcpy(end(w->data), op->keys.list, n * sizeof(uint64_t)); | ||||
| 	w->data->keys += n; | ||||
| 
 | ||||
| 	op->journal = &fifo_back(&c->journal.pin); | ||||
| 	atomic_inc(op->journal); | ||||
| 
 | ||||
| 	if (op->flush_journal) { | ||||
| 		closure_flush(&c->journal.io); | ||||
| 		closure_wait(&w->wait, cl->parent); | ||||
| 	} | ||||
| 
 | ||||
| 	journal_try_write(c); | ||||
| out: | ||||
| 	bch_btree_insert_async(cl); | ||||
| } | ||||
| 
 | ||||
| void bch_journal_free(struct cache_set *c) | ||||
| { | ||||
| 	free_pages((unsigned long) c->journal.w[1].data, JSET_BITS); | ||||
| 	free_pages((unsigned long) c->journal.w[0].data, JSET_BITS); | ||||
| 	free_fifo(&c->journal.pin); | ||||
| } | ||||
| 
 | ||||
| int bch_journal_alloc(struct cache_set *c) | ||||
| { | ||||
| 	struct journal *j = &c->journal; | ||||
| 
 | ||||
| 	closure_init_unlocked(&j->io); | ||||
| 	spin_lock_init(&j->lock); | ||||
| 
 | ||||
| 	c->journal_delay_ms = 100; | ||||
| 
 | ||||
| 	j->w[0].c = c; | ||||
| 	j->w[1].c = c; | ||||
| 
 | ||||
| 	if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || | ||||
| 	    !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) || | ||||
| 	    !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS))) | ||||
| 		return -ENOMEM; | ||||
| 
 | ||||
| 	return 0; | ||||
| } | ||||
							
								
								
									
										215
									
								
								drivers/md/bcache/journal.h
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										215
									
								
								drivers/md/bcache/journal.h
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,215 @@ | |||
| #ifndef _BCACHE_JOURNAL_H | ||||
| #define _BCACHE_JOURNAL_H | ||||
| 
 | ||||
| /*
 | ||||
|  * THE JOURNAL: | ||||
|  * | ||||
|  * The journal is treated as a circular buffer of buckets - a journal entry | ||||
|  * never spans two buckets. This means (not implemented yet) we can resize the | ||||
|  * journal at runtime, and will be needed for bcache on raw flash support. | ||||
|  * | ||||
|  * Journal entries contain a list of keys, ordered by the time they were | ||||
|  * inserted; thus journal replay just has to reinsert the keys. | ||||
|  * | ||||
|  * We also keep some things in the journal header that are logically part of the | ||||
|  * superblock - all the things that are frequently updated. This is for future | ||||
|  * bcache on raw flash support; the superblock (which will become another | ||||
|  * journal) can't be moved or wear leveled, so it contains just enough | ||||
|  * information to find the main journal, and the superblock only has to be | ||||
|  * rewritten when we want to move/wear level the main journal. | ||||
|  * | ||||
|  * Currently, we don't journal BTREE_REPLACE operations - this will hopefully be | ||||
|  * fixed eventually. This isn't a bug - BTREE_REPLACE is used for insertions | ||||
|  * from cache misses, which don't have to be journaled, and for writeback and | ||||
|  * moving gc we work around it by flushing the btree to disk before updating the | ||||
|  * gc information. But it is a potential issue with incremental garbage | ||||
|  * collection, and it's fragile. | ||||
|  * | ||||
|  * OPEN JOURNAL ENTRIES: | ||||
|  * | ||||
|  * Each journal entry contains, in the header, the sequence number of the last | ||||
|  * journal entry still open - i.e. that has keys that haven't been flushed to | ||||
|  * disk in the btree. | ||||
|  * | ||||
|  * We track this by maintaining a refcount for every open journal entry, in a | ||||
|  * fifo; each entry in the fifo corresponds to a particular journal | ||||
|  * entry/sequence number. When the refcount at the tail of the fifo goes to | ||||
|  * zero, we pop it off - thus, the size of the fifo tells us the number of open | ||||
|  * journal entries | ||||
|  * | ||||
|  * We take a refcount on a journal entry when we add some keys to a journal | ||||
|  * entry that we're going to insert (held by struct btree_op), and then when we | ||||
|  * insert those keys into the btree the btree write we're setting up takes a | ||||
|  * copy of that refcount (held by struct btree_write). That refcount is dropped | ||||
|  * when the btree write completes. | ||||
|  * | ||||
|  * A struct btree_write can only hold a refcount on a single journal entry, but | ||||
|  * might contain keys for many journal entries - we handle this by making sure | ||||
|  * it always has a refcount on the _oldest_ journal entry of all the journal | ||||
|  * entries it has keys for. | ||||
|  * | ||||
|  * JOURNAL RECLAIM: | ||||
|  * | ||||
|  * As mentioned previously, our fifo of refcounts tells us the number of open | ||||
|  * journal entries; from that and the current journal sequence number we compute | ||||
|  * last_seq - the oldest journal entry we still need. We write last_seq in each | ||||
|  * journal entry, and we also have to keep track of where it exists on disk so | ||||
|  * we don't overwrite it when we loop around the journal. | ||||
|  * | ||||
|  * To do that we track, for each journal bucket, the sequence number of the | ||||
|  * newest journal entry it contains - if we don't need that journal entry we | ||||
|  * don't need anything in that bucket anymore. From that we track the last | ||||
|  * journal bucket we still need; all this is tracked in struct journal_device | ||||
|  * and updated by journal_reclaim(). | ||||
|  * | ||||
|  * JOURNAL FILLING UP: | ||||
|  * | ||||
|  * There are two ways the journal could fill up; either we could run out of | ||||
|  * space to write to, or we could have too many open journal entries and run out | ||||
|  * of room in the fifo of refcounts. Since those refcounts are decremented | ||||
|  * without any locking we can't safely resize that fifo, so we handle it the | ||||
|  * same way. | ||||
|  * | ||||
|  * If the journal fills up, we start flushing dirty btree nodes until we can | ||||
|  * allocate space for a journal write again - preferentially flushing btree | ||||
|  * nodes that are pinning the oldest journal entries first. | ||||
|  */ | ||||
| 
 | ||||
| #define BCACHE_JSET_VERSION_UUIDv1	1 | ||||
| /* Always latest UUID format */ | ||||
| #define BCACHE_JSET_VERSION_UUID	1 | ||||
| #define BCACHE_JSET_VERSION		1	/* value stored in jset.version by journal_write_unlocked() */ | ||||
| 
 | ||||
| /*
 | ||||
|  * On disk format for a journal entry: | ||||
|  * seq is monotonically increasing; every journal entry has its own unique | ||||
|  * sequence number. | ||||
|  * | ||||
|  * last_seq is the oldest journal entry that still has keys the btree hasn't | ||||
|  * flushed to disk yet. | ||||
|  * | ||||
|  * version is for on disk format changes. | ||||
|  */ | ||||
| struct jset {	/* field order and sizes are the on disk layout - do not rearrange */ | ||||
| 	uint64_t		csum;	/* csum_set() of this entry; checked by journal_read_bucket() */ | ||||
| 	uint64_t		magic;	/* jset_magic() of the owning cache set */ | ||||
| 	uint64_t		seq;	/* assigned from ++journal.seq in bch_journal_next() */ | ||||
| 	uint32_t		version;	/* BCACHE_JSET_VERSION */ | ||||
| 	uint32_t		keys;	/* size of the key area, counted in 64 bit words (see bch_journal()) */ | ||||
| 
 | ||||
| 	uint64_t		last_seq;	/* last_seq() of the journal at the time of the write */ | ||||
| 
 | ||||
| 	BKEY_PADDED(uuid_bucket);	/* copy of cache_set.uuid_bucket */ | ||||
| 	BKEY_PADDED(btree_root);	/* copy of the btree root's key */ | ||||
| 	uint16_t		btree_level;	/* level of the btree root */ | ||||
| 	uint16_t		pad[3]; | ||||
| 
 | ||||
| 	uint64_t		prio_bucket[MAX_CACHES_PER_SET];	/* prio_buckets[0] of each cache, indexed by sb.nr_this_dev */ | ||||
| 
 | ||||
| 	union {	/* start of the key area; two views of the same bytes */ | ||||
| 		struct bkey	start[0]; | ||||
| 		uint64_t	d[0]; | ||||
| 	}; | ||||
| }; | ||||
| 
 | ||||
| /*
 | ||||
|  * Only used for holding the journal entries we read in bch_journal_read() | ||||
|  * during cache_registration | ||||
|  */ | ||||
| struct journal_replay { | ||||
| 	struct list_head	list;	/* link in the replay list, sorted by j.seq */ | ||||
| 	atomic_t		*pin;	/* this entry's slot in journal.pin; NULL if bch_journal_mark() found the fifo full */ | ||||
| 	struct jset		j;	/* must stay last: allocated as offsetof(j) + size of the entry read from disk */ | ||||
| }; | ||||
| 
 | ||||
| /*
 | ||||
|  * We put two of these in struct journal; we use them for writes to the | ||||
|  * journal that are being staged or in flight. | ||||
|  */ | ||||
| struct journal_write { | ||||
| 	struct jset		*data;	/* buffer of 2^JSET_BITS pages, allocated in bch_journal_alloc() */ | ||||
| #define JSET_BITS		3 | ||||
| 
 | ||||
| 	struct cache_set	*c;	/* owning cache set; used by journal_write_endio() */ | ||||
| 	struct closure_waitlist	wait;	/* closures waiting for this write; woken in journal_write_done() */ | ||||
| 	bool			need_write;	/* set once something wants this entry written; cleared by bch_journal_next() */ | ||||
| }; | ||||
| 
 | ||||
| /* Embedded in struct cache_set */ | ||||
| struct journal { | ||||
| 	spinlock_t		lock;	/* taken by journal_write(), bch_journal() and bch_journal_meta() */ | ||||
| 	/* used when waiting because the journal was full */ | ||||
| 	struct closure_waitlist	wait; | ||||
| 	struct closure_with_timer io;	/* closure that runs journal_write() / journal_write_done() */ | ||||
| 
 | ||||
| 	/* Number of blocks free in the bucket(s) we're currently writing to */ | ||||
| 	unsigned		blocks_free; | ||||
| 	uint64_t		seq;	/* sequence number of the entry currently being filled */ | ||||
| 	DECLARE_FIFO(atomic_t, pin);	/* one refcount per open journal entry; the back is the current entry */ | ||||
| 
 | ||||
| 	BKEY_PADDED(key);	/* one pointer per device for the next journal write; rebuilt by journal_reclaim() */ | ||||
| 
 | ||||
| 	struct journal_write	w[2], *cur;	/* bch_journal_next() flips cur between the two buffers */ | ||||
| }; | ||||
| 
 | ||||
| /*
 | ||||
|  * Embedded in struct cache. First three fields refer to the array of journal | ||||
|  * buckets, in cache_sb. | ||||
|  */ | ||||
| struct journal_device { | ||||
| 	/*
 | ||||
| 	 * For each journal bucket, contains the max sequence number of the | ||||
| 	 * journal writes it contains - so we know when a bucket can be reused. | ||||
| 	 */ | ||||
| 	uint64_t		seq[SB_JOURNAL_BUCKETS]; | ||||
| 
 | ||||
| 	/* Journal bucket we're currently writing to */ | ||||
| 	unsigned		cur_idx; | ||||
| 
 | ||||
| 	/* Last journal bucket that still contains an open journal entry */ | ||||
| 	unsigned		last_idx; | ||||
| 
 | ||||
| 	/* Next journal bucket to be discarded */ | ||||
| 	unsigned		discard_idx; | ||||
| 
 | ||||
| #define DISCARD_READY		0 | ||||
| #define DISCARD_IN_FLIGHT	1 | ||||
| #define DISCARD_DONE		2 | ||||
| 	/* One of DISCARD_READY, DISCARD_IN_FLIGHT or DISCARD_DONE */ | ||||
| 	atomic_t		discard_in_flight; | ||||
| 
 | ||||
| 	struct work_struct	discard_work;	/* runs journal_discard_work(), which submits discard_bio */ | ||||
| 	struct bio		discard_bio; | ||||
| 	struct bio_vec		discard_bv;	/* NOTE(review): presumably the inline vec slot for discard_bio - confirm */ | ||||
| 
 | ||||
| 	/* Bio for journal reads/writes to this device */ | ||||
| 	struct bio		bio; | ||||
| 	struct bio_vec		bv[8];	/* NOTE(review): presumably the vecs for bio, one per page of a journal buffer - confirm */ | ||||
| }; | ||||
| 
 | ||||
| #define journal_pin_cmp(c, l, r)	/* true if l's journal pin has a higher fifo_idx() than r's */			\ | ||||
| 	(fifo_idx(&(c)->journal.pin, (l)->journal) >		\ | ||||
| 	 fifo_idx(&(c)->journal.pin, (r)->journal)) | ||||
| 
 | ||||
| #define JOURNAL_PIN	20000	/* size passed to init_fifo() for journal.pin in bch_journal_alloc() */ | ||||
| 
 | ||||
| #define journal_full(j)	/* no blocks left to write to, or at most one free slot in the pin fifo */					\ | ||||
| 	(!(j)->blocks_free || fifo_free(&(j)->pin) <= 1) | ||||
| 
 | ||||
| struct closure; | ||||
| struct cache_set; | ||||
| struct btree_op; | ||||
| 
 | ||||
| void bch_journal(struct closure *); | ||||
| void bch_journal_next(struct journal *); | ||||
| void bch_journal_mark(struct cache_set *, struct list_head *); | ||||
| void bch_journal_meta(struct cache_set *, struct closure *); | ||||
| int bch_journal_read(struct cache_set *, struct list_head *, | ||||
| 			struct btree_op *); | ||||
| int bch_journal_replay(struct cache_set *, struct list_head *, | ||||
| 			  struct btree_op *); | ||||
| 
 | ||||
| void bch_journal_free(struct cache_set *); | ||||
| int bch_journal_alloc(struct cache_set *); | ||||
| 
 | ||||
| #endif /* _BCACHE_JOURNAL_H */ | ||||
							
								
								
									
										254
									
								
								drivers/md/bcache/movinggc.c
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										254
									
								
								drivers/md/bcache/movinggc.c
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,254 @@ | |||
| /*
 | ||||
|  * Moving/copying garbage collector | ||||
|  * | ||||
|  * Copyright 2012 Google, Inc. | ||||
|  */ | ||||
| 
 | ||||
| #include "bcache.h" | ||||
| #include "btree.h" | ||||
| #include "debug.h" | ||||
| #include "request.h" | ||||
| 
 | ||||
/*
 * State for one key being moved by the copying garbage collector.
 *
 * read_moving() allocates this with extra room for bio_vecs directly after
 * the structure, and moving_init() points bi_io_vec at bio.bio's inline
 * vecs - so @bio has to stay the last member.
 */
struct moving_io {
	/* Key (from c->moving_gc_keys) whose data is being moved */
	struct keybuf_key	*w;
	/* Search/closure state driving the read and the re-insert */
	struct search		s;
	/* Bio used for both the read and the subsequent write */
	struct bbio		bio;
};
| 
 | ||||
| static bool moving_pred(struct keybuf *buf, struct bkey *k) | ||||
| { | ||||
| 	struct cache_set *c = container_of(buf, struct cache_set, | ||||
| 					   moving_gc_keys); | ||||
| 	unsigned i; | ||||
| 
 | ||||
| 	for (i = 0; i < KEY_PTRS(k); i++) { | ||||
| 		struct cache *ca = PTR_CACHE(c, k, i); | ||||
| 		struct bucket *g = PTR_BUCKET(c, k, i); | ||||
| 
 | ||||
| 		if (GC_SECTORS_USED(g) < ca->gc_move_threshold) | ||||
| 			return true; | ||||
| 	} | ||||
| 
 | ||||
| 	return false; | ||||
| } | ||||
| 
 | ||||
| /* Moving GC - IO loop */ | ||||
| 
 | ||||
| static void moving_io_destructor(struct closure *cl) | ||||
| { | ||||
| 	struct moving_io *io = container_of(cl, struct moving_io, s.cl); | ||||
| 	kfree(io); | ||||
| } | ||||
| 
 | ||||
| static void write_moving_finish(struct closure *cl) | ||||
| { | ||||
| 	struct moving_io *io = container_of(cl, struct moving_io, s.cl); | ||||
| 	struct bio *bio = &io->bio.bio; | ||||
| 	struct bio_vec *bv = bio_iovec_idx(bio, bio->bi_vcnt); | ||||
| 
 | ||||
| 	while (bv-- != bio->bi_io_vec) | ||||
| 		__free_page(bv->bv_page); | ||||
| 
 | ||||
| 	pr_debug("%s %s", io->s.op.insert_collision | ||||
| 		 ? "collision moving" : "moved", | ||||
| 		 pkey(&io->w->key)); | ||||
| 
 | ||||
| 	bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w); | ||||
| 
 | ||||
| 	atomic_dec_bug(&io->s.op.c->in_flight); | ||||
| 	closure_wake_up(&io->s.op.c->moving_gc_wait); | ||||
| 
 | ||||
| 	closure_return_with_destructor(cl, moving_io_destructor); | ||||
| } | ||||
| 
 | ||||
| static void read_moving_endio(struct bio *bio, int error) | ||||
| { | ||||
| 	struct moving_io *io = container_of(bio->bi_private, | ||||
| 					    struct moving_io, s.cl); | ||||
| 
 | ||||
| 	if (error) | ||||
| 		io->s.error = error; | ||||
| 
 | ||||
| 	bch_bbio_endio(io->s.op.c, bio, error, "reading data to move"); | ||||
| } | ||||
| 
 | ||||
| static void moving_init(struct moving_io *io) | ||||
| { | ||||
| 	struct bio *bio = &io->bio.bio; | ||||
| 
 | ||||
| 	bio_init(bio); | ||||
| 	bio_get(bio); | ||||
| 	bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); | ||||
| 
 | ||||
| 	bio->bi_size		= KEY_SIZE(&io->w->key) << 9; | ||||
| 	bio->bi_max_vecs	= DIV_ROUND_UP(KEY_SIZE(&io->w->key), | ||||
| 					       PAGE_SECTORS); | ||||
| 	bio->bi_private		= &io->s.cl; | ||||
| 	bio->bi_io_vec		= bio->bi_inline_vecs; | ||||
| 	bio_map(bio, NULL); | ||||
| } | ||||
| 
 | ||||
| static void write_moving(struct closure *cl) | ||||
| { | ||||
| 	struct search *s = container_of(cl, struct search, cl); | ||||
| 	struct moving_io *io = container_of(s, struct moving_io, s); | ||||
| 
 | ||||
| 	if (!s->error) { | ||||
| 		trace_bcache_write_moving(&io->bio.bio); | ||||
| 
 | ||||
| 		moving_init(io); | ||||
| 
 | ||||
| 		io->bio.bio.bi_sector	= KEY_START(&io->w->key); | ||||
| 		s->op.lock		= -1; | ||||
| 		s->op.write_prio	= 1; | ||||
| 		s->op.cache_bio		= &io->bio.bio; | ||||
| 
 | ||||
| 		s->writeback		= KEY_DIRTY(&io->w->key); | ||||
| 		s->op.csum		= KEY_CSUM(&io->w->key); | ||||
| 
 | ||||
| 		s->op.type = BTREE_REPLACE; | ||||
| 		bkey_copy(&s->op.replace, &io->w->key); | ||||
| 
 | ||||
| 		closure_init(&s->op.cl, cl); | ||||
| 		bch_insert_data(&s->op.cl); | ||||
| 	} | ||||
| 
 | ||||
| 	continue_at(cl, write_moving_finish, NULL); | ||||
| } | ||||
| 
 | ||||
| static void read_moving_submit(struct closure *cl) | ||||
| { | ||||
| 	struct search *s = container_of(cl, struct search, cl); | ||||
| 	struct moving_io *io = container_of(s, struct moving_io, s); | ||||
| 	struct bio *bio = &io->bio.bio; | ||||
| 
 | ||||
| 	trace_bcache_read_moving(bio); | ||||
| 	bch_submit_bbio(bio, s->op.c, &io->w->key, 0); | ||||
| 
 | ||||
| 	continue_at(cl, write_moving, bch_gc_wq); | ||||
| } | ||||
| 
 | ||||
/*
 * Main loop of moving GC: pulls keys out of c->moving_gc_keys and starts an
 * asynchronous read + rewrite (read_moving_submit -> write_moving ->
 * write_moving_finish) for each, until the keybuf is exhausted or the cache
 * set is stopping. Once 64 moves are in flight it waits on
 * c->moving_gc_wait and re-enters itself via continue_at().
 */
static void read_moving(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, moving_gc);
	struct keybuf_key *w;
	struct moving_io *io;
	struct bio *bio;

	/* XXX: if we error, background writeback could stall indefinitely */

	while (!test_bit(CACHE_SET_STOPPING, &c->flags)) {
		w = bch_keybuf_next_rescan(c, &c->moving_gc_keys, &MAX_KEY);
		if (!w)
			break;

		/*
		 * The bio_vecs for the data pages live directly behind the
		 * moving_io; one per page of the key's size.
		 */
		io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec)
			     * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
			     GFP_KERNEL);
		if (!io)
			goto err;

		w->private	= io;
		io->w		= w;
		io->s.op.inode	= KEY_INODE(&w->key);
		io->s.op.c	= c;

		moving_init(io);
		bio = &io->bio.bio;

		bio->bi_rw	= READ;
		bio->bi_end_io	= read_moving_endio;

		if (bio_alloc_pages(bio, GFP_KERNEL))
			goto err;

		pr_debug("%s", pkey(&w->key));

		closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl);

		/* Throttle: at most 64 moves in flight at any time */
		if (atomic_inc_return(&c->in_flight) >= 64) {
			closure_wait_event(&c->moving_gc_wait, cl,
					   atomic_read(&c->in_flight) < 64);
			continue_at(cl, read_moving, bch_gc_wq);
		}
	}

	/*
	 * Error path, only reachable through the gotos above: free the
	 * moving_io hanging off the key (if any) and drop the key again.
	 * NOTE(review): on kzalloc() failure w->private has not been assigned
	 * here, so this relies on the keybuf having initialised it - confirm.
	 */
	if (0) {
err:		if (!IS_ERR_OR_NULL(w->private))
			kfree(w->private);

		bch_keybuf_del(&c->moving_gc_keys, w);
	}

	closure_return(cl);
}
| 
 | ||||
/*
 * Entry point for moving/copying GC. For every cache in the set this works
 * out gc_move_threshold - moving_pred() later selects keys whose buckets
 * have fewer used sectors than that - and then kicks off read_moving().
 * Does nothing if copy_gc_enabled is clear.
 */
void bch_moving_gc(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, gc.cl);
	struct cache *ca;
	struct bucket *b;
	unsigned i;

	/* Heap ordering: true when @l has fewer used sectors than @r */
	bool bucket_cmp(struct bucket *l, struct bucket *r)
	{
		return GC_SECTORS_USED(l) < GC_SECTORS_USED(r);
	}

	/*
	 * Used sectors of the bucket at the top of ca's heap.
	 * NOTE(review): this is also called when the heap may have ended up
	 * empty - confirm heap_peek() is safe to dereference in that case.
	 */
	unsigned top(struct cache *ca)
	{
		return GC_SECTORS_USED(heap_peek(&ca->heap));
	}

	if (!c->copy_gc_enabled)
		closure_return(cl);

	mutex_lock(&c->bucket_lock);

	for_each_cache(ca, c, i) {
		unsigned sectors_to_move = 0;
		/*
		 * Budget for how much data may be moved: bucket_size times
		 * the smaller of the free fifo's fill level and half its size.
		 */
		unsigned reserve_sectors = ca->sb.bucket_size *
			min(fifo_used(&ca->free), ca->free.size / 2);

		ca->heap.used = 0;

		/*
		 * Collect buckets that contain data; once the heap is full a
		 * new bucket only displaces the current top if it has fewer
		 * used sectors. sectors_to_move tracks the heap's total.
		 */
		for_each_bucket(b, ca) {
			if (!GC_SECTORS_USED(b))
				continue;

			if (!heap_full(&ca->heap)) {
				sectors_to_move += GC_SECTORS_USED(b);
				heap_add(&ca->heap, b, bucket_cmp);
			} else if (bucket_cmp(b, heap_peek(&ca->heap))) {
				sectors_to_move -= top(ca);
				sectors_to_move += GC_SECTORS_USED(b);

				ca->heap.data[0] = b;
				heap_sift(&ca->heap, 0, bucket_cmp);
			}
		}

		/* Drop buckets until what is left fits in the budget */
		while (sectors_to_move > reserve_sectors) {
			heap_pop(&ca->heap, b, bucket_cmp);
			sectors_to_move -= GC_SECTORS_USED(b);
		}

		ca->gc_move_threshold = top(ca);

		pr_debug("threshold %u", ca->gc_move_threshold);
	}

	mutex_unlock(&c->bucket_lock);

	/* Restart the key scan from the beginning of the keyspace */
	c->moving_gc_keys.last_scanned = ZERO_KEY;

	closure_init(&c->moving_gc, cl);
	read_moving(&c->moving_gc);

	closure_return(cl);
}
| 
 | ||||
/* Set up the cache set's moving GC keybuf with moving_pred() as its filter. */
void bch_moving_init_cache_set(struct cache_set *c)
{
	bch_keybuf_init(&c->moving_gc_keys, moving_pred);
}
							
								
								
									
										1409
									
								
								drivers/md/bcache/request.c
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										1409
									
								
								drivers/md/bcache/request.c
									
										
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							
							
								
								
									
										62
									
								
								drivers/md/bcache/request.h
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										62
									
								
								drivers/md/bcache/request.h
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,62 @@ | |||
| #ifndef _BCACHE_REQUEST_H_ | ||||
| #define _BCACHE_REQUEST_H_ | ||||
| 
 | ||||
| #include <linux/cgroup.h> | ||||
| 
 | ||||
/*
 * Per-request state. Besides the request path this is also embedded in
 * struct moving_io, where moving GC uses cl, error, writeback and op.
 */
struct search {
	/* Stack frame for bio_complete */
	struct closure		cl;

	struct bcache_device	*d;
	struct task_struct	*task;

	struct bbio		bio;
	struct bio		*orig_bio;
	struct bio		*cache_miss;
	unsigned		cache_bio_sectors;

	unsigned		recoverable:1;
	unsigned		unaligned_bvec:1;

	unsigned		write:1;
	unsigned		writeback:1;

	/* IO error returned to s->bio */
	short			error;
	unsigned long		start_time;

	/* Anything past op->keys won't get zeroed in do_bio_hook */
	struct btree_op		op;
};
| 
 | ||||
| void bch_cache_read_endio(struct bio *, int); | ||||
| int bch_get_congested(struct cache_set *); | ||||
| void bch_insert_data(struct closure *cl); | ||||
| void bch_btree_insert_async(struct closure *); | ||||
| void bch_cache_read_endio(struct bio *, int); | ||||
| 
 | ||||
| void bch_open_buckets_free(struct cache_set *); | ||||
| int bch_open_buckets_alloc(struct cache_set *); | ||||
| 
 | ||||
| void bch_cached_dev_request_init(struct cached_dev *dc); | ||||
| void bch_flash_dev_request_init(struct bcache_device *d); | ||||
| 
 | ||||
| extern struct kmem_cache *bch_search_cache, *bch_passthrough_cache; | ||||
| 
 | ||||
/*
 * Per-cgroup bcache settings and statistics. The css member only exists
 * when CONFIG_CGROUP_BCACHE is enabled.
 */
struct bch_cgroup {
#ifdef CONFIG_CGROUP_BCACHE
	struct cgroup_subsys_state	css;
#endif
	/*
	 * We subtract one from the index into bch_cache_modes[], so that
	 * default == -1; this makes it so the rest match up with d->cache_mode,
	 * and we use d->cache_mode if cgrp->cache_mode < 0
	 */
	short				cache_mode;
	bool				verify;
	/* Hit/miss counters, bumped by bch_mark_cache_accounting() */
	struct cache_stat_collector	stats;
};
| 
 | ||||
| struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio); | ||||
| 
 | ||||
| #endif /* _BCACHE_REQUEST_H_ */ | ||||
							
								
								
									
										245
									
								
								drivers/md/bcache/stats.c
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										245
									
								
								drivers/md/bcache/stats.c
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,245 @@ | |||
| /*
 | ||||
|  * bcache stats code | ||||
|  * | ||||
|  * Copyright 2012 Google, Inc. | ||||
|  */ | ||||
| 
 | ||||
| #include "bcache.h" | ||||
| #include "stats.h" | ||||
| #include "btree.h" | ||||
| #include "request.h" | ||||
| #include "sysfs.h" | ||||
| 
 | ||||
/*
 * We keep absolute totals of various statistics, and additionally a set of
 * three rolling averages.
 *
 * Every so often, a timer goes off and rescales the rolling averages.
 * accounting_rescale[] is how many times the timer has to go off before we
 * rescale each set of numbers; that gets us half lives of 5 minutes, one hour,
 * and one day.
 *
 * accounting_delay is how often the timer goes off - 22 times in 5 minutes,
 * and accounting_weight is what we use to rescale:
 *
 * pow(31 / 32, 22) ~= 1/2
 *
 * So that we don't have to increment each set of numbers every time we (say)
 * get a cache hit, we increment a single atomic_t in acc->collector, and when
 * the rescale function runs it resets the atomic counter to 0 and adds its
 * old value to each of the exported numbers.
 *
 * To reduce rounding error, the numbers in struct cache_stats are all
 * stored left shifted by 16, and scaled back in the sysfs show() function.
 */
| 
 | ||||
/*
 * Number of timer ticks between rescales of each rolling average (passed to
 * scale_stats() as rescale_at), the timer period in jiffies (a 22nd of five
 * minutes), and the EWMA weight handed to ewma_add().
 */
static const unsigned DAY_RESCALE		= 288;
static const unsigned HOUR_RESCALE		= 12;
static const unsigned FIVE_MINUTE_RESCALE	= 1;
static const unsigned accounting_delay		= (HZ * 300) / 22;
static const unsigned accounting_weight		= 32;
| 
 | ||||
| /* sysfs reading/writing */ | ||||
| 
 | ||||
| read_attribute(cache_hits); | ||||
| read_attribute(cache_misses); | ||||
| read_attribute(cache_bypass_hits); | ||||
| read_attribute(cache_bypass_misses); | ||||
| read_attribute(cache_hit_ratio); | ||||
| read_attribute(cache_readaheads); | ||||
| read_attribute(cache_miss_collisions); | ||||
| read_attribute(bypassed); | ||||
| 
 | ||||
/*
 * sysfs show() for one set of statistics (total / five_minute / hour / day).
 * The stored counters are left shifted by 16, so var() shifts them back
 * before they are printed.
 */
SHOW(bch_stats)
{
	struct cache_stats *s =
		container_of(kobj, struct cache_stats, kobj);
#define var(stat)		(s->stat >> 16)
	var_print(cache_hits);
	var_print(cache_misses);
	var_print(cache_bypass_hits);
	var_print(cache_bypass_misses);

	/* Hits as a percentage of hits + misses; DIV_SAFE guards the divide */
	sysfs_print(cache_hit_ratio,
		    DIV_SAFE(var(cache_hits) * 100,
			     var(cache_hits) + var(cache_misses)));

	var_print(cache_readaheads);
	var_print(cache_miss_collisions);
	/* Sector count times 512 (presumably sectors -> bytes) */
	sysfs_hprint(bypassed,	var(sectors_bypassed) << 9);
#undef var
	return 0;
}
| 
 | ||||
/* The stats attributes are read-only: writes are accepted and ignored. */
STORE(bch_stats)
{
	return size;
}
| 
 | ||||
/*
 * kobject release callback - intentionally empty. The cache_stats kobjects
 * are members of struct cache_accounting, so there is nothing to free here.
 */
static void bch_stats_release(struct kobject *k)
{
}
| 
 | ||||
/* NULL-terminated list of the attributes handled by SHOW(bch_stats) */
static struct attribute *bch_stats_files[] = {
	&sysfs_cache_hits,
	&sysfs_cache_misses,
	&sysfs_cache_bypass_hits,
	&sysfs_cache_bypass_misses,
	&sysfs_cache_hit_ratio,
	&sysfs_cache_readaheads,
	&sysfs_cache_miss_collisions,
	&sysfs_bypassed,
	NULL
};
/* Defines bch_stats_ktype, used by bch_cache_accounting_init() */
static KTYPE(bch_stats);
| 
 | ||||
| static void scale_accounting(unsigned long data); | ||||
| 
 | ||||
| void bch_cache_accounting_init(struct cache_accounting *acc, struct closure *parent) | ||||
| { | ||||
| 	kobject_init(&acc->total.kobj,		&bch_stats_ktype); | ||||
| 	kobject_init(&acc->five_minute.kobj,	&bch_stats_ktype); | ||||
| 	kobject_init(&acc->hour.kobj,		&bch_stats_ktype); | ||||
| 	kobject_init(&acc->day.kobj,		&bch_stats_ktype); | ||||
| 
 | ||||
| 	closure_init(&acc->cl, parent); | ||||
| 	init_timer(&acc->timer); | ||||
| 	acc->timer.expires	= jiffies + accounting_delay; | ||||
| 	acc->timer.data		= (unsigned long) acc; | ||||
| 	acc->timer.function	= scale_accounting; | ||||
| 	add_timer(&acc->timer); | ||||
| } | ||||
| 
 | ||||
| int bch_cache_accounting_add_kobjs(struct cache_accounting *acc, | ||||
| 				   struct kobject *parent) | ||||
| { | ||||
| 	int ret = kobject_add(&acc->total.kobj, parent, | ||||
| 			      "stats_total"); | ||||
| 	ret = ret ?: kobject_add(&acc->five_minute.kobj, parent, | ||||
| 				 "stats_five_minute"); | ||||
| 	ret = ret ?: kobject_add(&acc->hour.kobj, parent, | ||||
| 				 "stats_hour"); | ||||
| 	ret = ret ?: kobject_add(&acc->day.kobj, parent, | ||||
| 				 "stats_day"); | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| void bch_cache_accounting_clear(struct cache_accounting *acc) | ||||
| { | ||||
| 	memset(&acc->total.cache_hits, | ||||
| 	       0, | ||||
| 	       sizeof(unsigned long) * 7); | ||||
| } | ||||
| 
 | ||||
/*
 * Tear down @acc: drops the references on the four stats kobjects and stops
 * the accounting timer.
 */
void bch_cache_accounting_destroy(struct cache_accounting *acc)
{
	kobject_put(&acc->total.kobj);
	kobject_put(&acc->five_minute.kobj);
	kobject_put(&acc->hour.kobj);
	kobject_put(&acc->day.kobj);

	/*
	 * Tell scale_accounting() not to re-arm. If the timer was still
	 * pending we finish the closure here; otherwise scale_accounting()
	 * sees ->closing and does the closure_return() itself.
	 */
	atomic_set(&acc->closing, 1);
	if (del_timer_sync(&acc->timer))
		closure_return(&acc->cl);
}
| 
 | ||||
| /* EWMA scaling */ | ||||
| 
 | ||||
/* Decay one counter by folding a zero sample into it via ewma_add(). */
static void scale_stat(unsigned long *stat)
{
	*stat =  ewma_add(*stat, 0, accounting_weight, 0);
}
| 
 | ||||
| static void scale_stats(struct cache_stats *stats, unsigned long rescale_at) | ||||
| { | ||||
| 	if (++stats->rescale == rescale_at) { | ||||
| 		stats->rescale = 0; | ||||
| 		scale_stat(&stats->cache_hits); | ||||
| 		scale_stat(&stats->cache_misses); | ||||
| 		scale_stat(&stats->cache_bypass_hits); | ||||
| 		scale_stat(&stats->cache_bypass_misses); | ||||
| 		scale_stat(&stats->cache_readaheads); | ||||
| 		scale_stat(&stats->cache_miss_collisions); | ||||
| 		scale_stat(&stats->sectors_bypassed); | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| static void scale_accounting(unsigned long data) | ||||
| { | ||||
| 	struct cache_accounting *acc = (struct cache_accounting *) data; | ||||
| 
 | ||||
| #define move_stat(name) do {						\ | ||||
| 	unsigned t = atomic_xchg(&acc->collector.name, 0);		\ | ||||
| 	t <<= 16;							\ | ||||
| 	acc->five_minute.name += t;					\ | ||||
| 	acc->hour.name += t;						\ | ||||
| 	acc->day.name += t;						\ | ||||
| 	acc->total.name += t;						\ | ||||
| } while (0) | ||||
| 
 | ||||
| 	move_stat(cache_hits); | ||||
| 	move_stat(cache_misses); | ||||
| 	move_stat(cache_bypass_hits); | ||||
| 	move_stat(cache_bypass_misses); | ||||
| 	move_stat(cache_readaheads); | ||||
| 	move_stat(cache_miss_collisions); | ||||
| 	move_stat(sectors_bypassed); | ||||
| 
 | ||||
| 	scale_stats(&acc->total, 0); | ||||
| 	scale_stats(&acc->day, DAY_RESCALE); | ||||
| 	scale_stats(&acc->hour, HOUR_RESCALE); | ||||
| 	scale_stats(&acc->five_minute, FIVE_MINUTE_RESCALE); | ||||
| 
 | ||||
| 	acc->timer.expires += accounting_delay; | ||||
| 
 | ||||
| 	if (!atomic_read(&acc->closing)) | ||||
| 		add_timer(&acc->timer); | ||||
| 	else | ||||
| 		closure_return(&acc->cl); | ||||
| } | ||||
| 
 | ||||
| static void mark_cache_stats(struct cache_stat_collector *stats, | ||||
| 			     bool hit, bool bypass) | ||||
| { | ||||
| 	if (!bypass) | ||||
| 		if (hit) | ||||
| 			atomic_inc(&stats->cache_hits); | ||||
| 		else | ||||
| 			atomic_inc(&stats->cache_misses); | ||||
| 	else | ||||
| 		if (hit) | ||||
| 			atomic_inc(&stats->cache_bypass_hits); | ||||
| 		else | ||||
| 			atomic_inc(&stats->cache_bypass_misses); | ||||
| } | ||||
| 
 | ||||
| void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass) | ||||
| { | ||||
| 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | ||||
| 	mark_cache_stats(&dc->accounting.collector, hit, bypass); | ||||
| 	mark_cache_stats(&s->op.c->accounting.collector, hit, bypass); | ||||
| #ifdef CONFIG_CGROUP_BCACHE | ||||
| 	mark_cache_stats(&(bch_bio_to_cgroup(s->orig_bio)->stats), hit, bypass); | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| void bch_mark_cache_readahead(struct search *s) | ||||
| { | ||||
| 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | ||||
| 	atomic_inc(&dc->accounting.collector.cache_readaheads); | ||||
| 	atomic_inc(&s->op.c->accounting.collector.cache_readaheads); | ||||
| } | ||||
| 
 | ||||
| void bch_mark_cache_miss_collision(struct search *s) | ||||
| { | ||||
| 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | ||||
| 	atomic_inc(&dc->accounting.collector.cache_miss_collisions); | ||||
| 	atomic_inc(&s->op.c->accounting.collector.cache_miss_collisions); | ||||
| } | ||||
| 
 | ||||
| void bch_mark_sectors_bypassed(struct search *s, int sectors) | ||||
| { | ||||
| 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | ||||
| 	atomic_add(sectors, &dc->accounting.collector.sectors_bypassed); | ||||
| 	atomic_add(sectors, &s->op.c->accounting.collector.sectors_bypassed); | ||||
| } | ||||
							
								
								
									
										58
									
								
								drivers/md/bcache/stats.h
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										58
									
								
								drivers/md/bcache/stats.h
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,58 @@ | |||
| #ifndef _BCACHE_STATS_H_ | ||||
| #define _BCACHE_STATS_H_ | ||||
| 
 | ||||
/*
 * Raw event counters, bumped atomically as events happen. The accounting
 * timer periodically swaps each one back to zero and folds the old value
 * into the struct cache_stats sets.
 */
struct cache_stat_collector {
	atomic_t cache_hits;
	atomic_t cache_misses;
	atomic_t cache_bypass_hits;
	atomic_t cache_bypass_misses;
	atomic_t cache_readaheads;
	atomic_t cache_miss_collisions;
	atomic_t sectors_bypassed;
};
| 
 | ||||
/*
 * One exported set of statistics (a total or one of the rolling averages),
 * with its own sysfs kobject.
 */
struct cache_stats {
	struct kobject		kobj;

	/*
	 * Stored left shifted by 16 and shifted back when shown in sysfs.
	 * bch_cache_accounting_clear() zeroes these as a run of seven
	 * unsigned longs starting at cache_hits, so keep them together.
	 */
	unsigned long cache_hits;
	unsigned long cache_misses;
	unsigned long cache_bypass_hits;
	unsigned long cache_bypass_misses;
	unsigned long cache_readaheads;
	unsigned long cache_miss_collisions;
	unsigned long sectors_bypassed;

	/* Timer ticks since this set was last rescaled */
	unsigned		rescale;
};
| 
 | ||||
/*
 * All statistics state for one object: the raw collector, the periodic
 * timer that drains it, and the four exported sets of numbers.
 */
struct cache_accounting {
	/* Finished (closure_return) once the timer has been shut down */
	struct closure		cl;
	/* Periodic timer; its callback is scale_accounting() */
	struct timer_list	timer;
	/* Set by bch_cache_accounting_destroy() to stop the timer re-arming */
	atomic_t		closing;

	struct cache_stat_collector collector;

	struct cache_stats total;
	struct cache_stats five_minute;
	struct cache_stats hour;
	struct cache_stats day;
};
| 
 | ||||
| struct search; | ||||
| 
 | ||||
| void bch_cache_accounting_init(struct cache_accounting *acc, | ||||
| 			       struct closure *parent); | ||||
| 
 | ||||
| int bch_cache_accounting_add_kobjs(struct cache_accounting *acc, | ||||
| 				   struct kobject *parent); | ||||
| 
 | ||||
| void bch_cache_accounting_clear(struct cache_accounting *acc); | ||||
| 
 | ||||
| void bch_cache_accounting_destroy(struct cache_accounting *acc); | ||||
| 
 | ||||
| void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass); | ||||
| void bch_mark_cache_readahead(struct search *s); | ||||
| void bch_mark_cache_miss_collision(struct search *s); | ||||
| void bch_mark_sectors_bypassed(struct search *s, int sectors); | ||||
| 
 | ||||
| #endif /* _BCACHE_STATS_H_ */ | ||||
							
								
								
									
										1941
									
								
								drivers/md/bcache/super.c
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										1941
									
								
								drivers/md/bcache/super.c
									
										
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							
							
								
								
									
										817
									
								
								drivers/md/bcache/sysfs.c
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										817
									
								
								drivers/md/bcache/sysfs.c
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,817 @@ | |||
| /*
 | ||||
|  * bcache sysfs interfaces | ||||
|  * | ||||
|  * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> | ||||
|  * Copyright 2012 Google, Inc. | ||||
|  */ | ||||
| 
 | ||||
| #include "bcache.h" | ||||
| #include "sysfs.h" | ||||
| #include "btree.h" | ||||
| #include "request.h" | ||||
| 
 | ||||
| #include <linux/sort.h> | ||||
| 
 | ||||
/* NULL-terminated list of cache replacement policy names */
static const char * const cache_replacement_policies[] = {
	"lru",
	"fifo",
	"random",
	NULL
};
| 
 | ||||
| write_attribute(attach); | ||||
| write_attribute(detach); | ||||
| write_attribute(unregister); | ||||
| write_attribute(stop); | ||||
| write_attribute(clear_stats); | ||||
| write_attribute(trigger_gc); | ||||
| write_attribute(prune_cache); | ||||
| write_attribute(flash_vol_create); | ||||
| 
 | ||||
| read_attribute(bucket_size); | ||||
| read_attribute(block_size); | ||||
| read_attribute(nbuckets); | ||||
| read_attribute(tree_depth); | ||||
| read_attribute(root_usage_percent); | ||||
| read_attribute(priority_stats); | ||||
| read_attribute(btree_cache_size); | ||||
| read_attribute(btree_cache_max_chain); | ||||
| read_attribute(cache_available_percent); | ||||
| read_attribute(written); | ||||
| read_attribute(btree_written); | ||||
| read_attribute(metadata_written); | ||||
| read_attribute(active_journal_entries); | ||||
| 
 | ||||
| sysfs_time_stats_attribute(btree_gc,	sec, ms); | ||||
| sysfs_time_stats_attribute(btree_split, sec, us); | ||||
| sysfs_time_stats_attribute(btree_sort,	ms,  us); | ||||
| sysfs_time_stats_attribute(btree_read,	ms,  us); | ||||
| sysfs_time_stats_attribute(try_harder,	ms,  us); | ||||
| 
 | ||||
| read_attribute(btree_nodes); | ||||
| read_attribute(btree_used_percent); | ||||
| read_attribute(average_key_size); | ||||
| read_attribute(dirty_data); | ||||
| read_attribute(bset_tree_stats); | ||||
| 
 | ||||
| read_attribute(state); | ||||
| read_attribute(cache_read_races); | ||||
| read_attribute(writeback_keys_done); | ||||
| read_attribute(writeback_keys_failed); | ||||
| read_attribute(io_errors); | ||||
| read_attribute(congested); | ||||
| rw_attribute(congested_read_threshold_us); | ||||
| rw_attribute(congested_write_threshold_us); | ||||
| 
 | ||||
| rw_attribute(sequential_cutoff); | ||||
| rw_attribute(sequential_merge); | ||||
| rw_attribute(data_csum); | ||||
| rw_attribute(cache_mode); | ||||
| rw_attribute(writeback_metadata); | ||||
| rw_attribute(writeback_running); | ||||
| rw_attribute(writeback_percent); | ||||
| rw_attribute(writeback_delay); | ||||
| rw_attribute(writeback_rate); | ||||
| 
 | ||||
| rw_attribute(writeback_rate_update_seconds); | ||||
| rw_attribute(writeback_rate_d_term); | ||||
| rw_attribute(writeback_rate_p_term_inverse); | ||||
| rw_attribute(writeback_rate_d_smooth); | ||||
| read_attribute(writeback_rate_debug); | ||||
| 
 | ||||
| rw_attribute(synchronous); | ||||
| rw_attribute(journal_delay_ms); | ||||
| rw_attribute(discard); | ||||
| rw_attribute(running); | ||||
| rw_attribute(label); | ||||
| rw_attribute(readahead); | ||||
| rw_attribute(io_error_limit); | ||||
| rw_attribute(io_error_halflife); | ||||
| rw_attribute(verify); | ||||
| rw_attribute(key_merging_disabled); | ||||
| rw_attribute(gc_always_rewrite); | ||||
| rw_attribute(freelist_percent); | ||||
| rw_attribute(cache_replacement_policy); | ||||
| rw_attribute(btree_shrinker_disabled); | ||||
| rw_attribute(copy_gc_enabled); | ||||
| rw_attribute(size); | ||||
| 
 | ||||
| SHOW(__bch_cached_dev) | ||||
| { | ||||
| 	struct cached_dev *dc = container_of(kobj, struct cached_dev, | ||||
| 					     disk.kobj); | ||||
| 	const char *states[] = { "no cache", "clean", "dirty", "inconsistent" }; | ||||
| 
 | ||||
| #define var(stat)		(dc->stat) | ||||
| 
 | ||||
| 	if (attr == &sysfs_cache_mode) | ||||
| 		return snprint_string_list(buf, PAGE_SIZE, | ||||
| 					   bch_cache_modes + 1, | ||||
| 					   BDEV_CACHE_MODE(&dc->sb)); | ||||
| 
 | ||||
| 	sysfs_printf(data_csum,		"%i", dc->disk.data_csum); | ||||
| 	var_printf(verify,		"%i"); | ||||
| 	var_printf(writeback_metadata,	"%i"); | ||||
| 	var_printf(writeback_running,	"%i"); | ||||
| 	var_print(writeback_delay); | ||||
| 	var_print(writeback_percent); | ||||
| 	sysfs_print(writeback_rate,	dc->writeback_rate.rate); | ||||
| 
 | ||||
| 	var_print(writeback_rate_update_seconds); | ||||
| 	var_print(writeback_rate_d_term); | ||||
| 	var_print(writeback_rate_p_term_inverse); | ||||
| 	var_print(writeback_rate_d_smooth); | ||||
| 
 | ||||
| 	if (attr == &sysfs_writeback_rate_debug) { | ||||
| 		char dirty[20]; | ||||
| 		char derivative[20]; | ||||
| 		char target[20]; | ||||
| 		hprint(dirty, | ||||
| 		       atomic_long_read(&dc->disk.sectors_dirty) << 9); | ||||
| 		hprint(derivative,	dc->writeback_rate_derivative << 9); | ||||
| 		hprint(target,		dc->writeback_rate_target << 9); | ||||
| 
 | ||||
| 		return sprintf(buf, | ||||
| 			       "rate:\t\t%u\n" | ||||
| 			       "change:\t\t%i\n" | ||||
| 			       "dirty:\t\t%s\n" | ||||
| 			       "derivative:\t%s\n" | ||||
| 			       "target:\t\t%s\n", | ||||
| 			       dc->writeback_rate.rate, | ||||
| 			       dc->writeback_rate_change, | ||||
| 			       dirty, derivative, target); | ||||
| 	} | ||||
| 
 | ||||
| 	sysfs_hprint(dirty_data, | ||||
| 		     atomic_long_read(&dc->disk.sectors_dirty) << 9); | ||||
| 
 | ||||
| 	var_printf(sequential_merge,	"%i"); | ||||
| 	var_hprint(sequential_cutoff); | ||||
| 	var_hprint(readahead); | ||||
| 
 | ||||
| 	sysfs_print(running,		atomic_read(&dc->running)); | ||||
| 	sysfs_print(state,		states[BDEV_STATE(&dc->sb)]); | ||||
| 
 | ||||
| 	if (attr == &sysfs_label) { | ||||
| 		memcpy(buf, dc->sb.label, SB_LABEL_SIZE); | ||||
| 		buf[SB_LABEL_SIZE + 1] = '\0'; | ||||
| 		strcat(buf, "\n"); | ||||
| 		return strlen(buf); | ||||
| 	} | ||||
| 
 | ||||
| #undef var | ||||
| 	return 0; | ||||
| } | ||||
| SHOW_LOCKED(bch_cached_dev) | ||||
| 
 | ||||
| STORE(__cached_dev) | ||||
| { | ||||
| 	struct cached_dev *dc = container_of(kobj, struct cached_dev, | ||||
| 					     disk.kobj); | ||||
| 	unsigned v = size; | ||||
| 	struct cache_set *c; | ||||
| 
 | ||||
| #define d_strtoul(var)		sysfs_strtoul(var, dc->var) | ||||
| #define d_strtoi_h(var)		sysfs_hatoi(var, dc->var) | ||||
| 
 | ||||
| 	sysfs_strtoul(data_csum,	dc->disk.data_csum); | ||||
| 	d_strtoul(verify); | ||||
| 	d_strtoul(writeback_metadata); | ||||
| 	d_strtoul(writeback_running); | ||||
| 	d_strtoul(writeback_delay); | ||||
| 	sysfs_strtoul_clamp(writeback_rate, | ||||
| 			    dc->writeback_rate.rate, 1, 1000000); | ||||
| 	sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40); | ||||
| 
 | ||||
| 	d_strtoul(writeback_rate_update_seconds); | ||||
| 	d_strtoul(writeback_rate_d_term); | ||||
| 	d_strtoul(writeback_rate_p_term_inverse); | ||||
| 	sysfs_strtoul_clamp(writeback_rate_p_term_inverse, | ||||
| 			    dc->writeback_rate_p_term_inverse, 1, INT_MAX); | ||||
| 	d_strtoul(writeback_rate_d_smooth); | ||||
| 
 | ||||
| 	d_strtoul(sequential_merge); | ||||
| 	d_strtoi_h(sequential_cutoff); | ||||
| 	d_strtoi_h(readahead); | ||||
| 
 | ||||
| 	if (attr == &sysfs_clear_stats) | ||||
| 		bch_cache_accounting_clear(&dc->accounting); | ||||
| 
 | ||||
| 	if (attr == &sysfs_running && | ||||
| 	    strtoul_or_return(buf)) | ||||
| 		bch_cached_dev_run(dc); | ||||
| 
 | ||||
| 	if (attr == &sysfs_cache_mode) { | ||||
| 		ssize_t v = read_string_list(buf, bch_cache_modes + 1); | ||||
| 
 | ||||
| 		if (v < 0) | ||||
| 			return v; | ||||
| 
 | ||||
| 		if ((unsigned) v != BDEV_CACHE_MODE(&dc->sb)) { | ||||
| 			SET_BDEV_CACHE_MODE(&dc->sb, v); | ||||
| 			bch_write_bdev_super(dc, NULL); | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	if (attr == &sysfs_label) { | ||||
| 		memcpy(dc->sb.label, buf, SB_LABEL_SIZE); | ||||
| 		bch_write_bdev_super(dc, NULL); | ||||
| 		if (dc->disk.c) { | ||||
| 			memcpy(dc->disk.c->uuids[dc->disk.id].label, | ||||
| 			       buf, SB_LABEL_SIZE); | ||||
| 			bch_uuid_write(dc->disk.c); | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	if (attr == &sysfs_attach) { | ||||
| 		if (parse_uuid(buf, dc->sb.set_uuid) < 16) | ||||
| 			return -EINVAL; | ||||
| 
 | ||||
| 		list_for_each_entry(c, &bch_cache_sets, list) { | ||||
| 			v = bch_cached_dev_attach(dc, c); | ||||
| 			if (!v) | ||||
| 				return size; | ||||
| 		} | ||||
| 
 | ||||
| 		pr_err("Can't attach %s: cache set not found", buf); | ||||
| 		size = v; | ||||
| 	} | ||||
| 
 | ||||
| 	if (attr == &sysfs_detach && dc->disk.c) | ||||
| 		bch_cached_dev_detach(dc); | ||||
| 
 | ||||
| 	if (attr == &sysfs_stop) | ||||
| 		bcache_device_stop(&dc->disk); | ||||
| 
 | ||||
| 	return size; | ||||
| } | ||||
| 
 | ||||
| STORE(bch_cached_dev) | ||||
| { | ||||
| 	struct cached_dev *dc = container_of(kobj, struct cached_dev, | ||||
| 					     disk.kobj); | ||||
| 
 | ||||
| 	mutex_lock(&bch_register_lock); | ||||
| 	size = __cached_dev_store(kobj, attr, buf, size); | ||||
| 
 | ||||
| 	if (attr == &sysfs_writeback_running) | ||||
| 		bch_writeback_queue(dc); | ||||
| 
 | ||||
| 	if (attr == &sysfs_writeback_percent) | ||||
| 		schedule_delayed_work(&dc->writeback_rate_update, | ||||
| 				      dc->writeback_rate_update_seconds * HZ); | ||||
| 
 | ||||
| 	mutex_unlock(&bch_register_lock); | ||||
| 	return size; | ||||
| } | ||||
| 
 | ||||
| static struct attribute *bch_cached_dev_files[] = { | ||||
| 	&sysfs_attach, | ||||
| 	&sysfs_detach, | ||||
| 	&sysfs_stop, | ||||
| #if 0 | ||||
| 	&sysfs_data_csum, | ||||
| #endif | ||||
| 	&sysfs_cache_mode, | ||||
| 	&sysfs_writeback_metadata, | ||||
| 	&sysfs_writeback_running, | ||||
| 	&sysfs_writeback_delay, | ||||
| 	&sysfs_writeback_percent, | ||||
| 	&sysfs_writeback_rate, | ||||
| 	&sysfs_writeback_rate_update_seconds, | ||||
| 	&sysfs_writeback_rate_d_term, | ||||
| 	&sysfs_writeback_rate_p_term_inverse, | ||||
| 	&sysfs_writeback_rate_d_smooth, | ||||
| 	&sysfs_writeback_rate_debug, | ||||
| 	&sysfs_dirty_data, | ||||
| 	&sysfs_sequential_cutoff, | ||||
| 	&sysfs_sequential_merge, | ||||
| 	&sysfs_clear_stats, | ||||
| 	&sysfs_running, | ||||
| 	&sysfs_state, | ||||
| 	&sysfs_label, | ||||
| 	&sysfs_readahead, | ||||
| #ifdef CONFIG_BCACHE_DEBUG | ||||
| 	&sysfs_verify, | ||||
| #endif | ||||
| 	NULL | ||||
| }; | ||||
| KTYPE(bch_cached_dev); | ||||
| 
 | ||||
| SHOW(bch_flash_dev) | ||||
| { | ||||
| 	struct bcache_device *d = container_of(kobj, struct bcache_device, | ||||
| 					       kobj); | ||||
| 	struct uuid_entry *u = &d->c->uuids[d->id]; | ||||
| 
 | ||||
| 	sysfs_printf(data_csum,	"%i", d->data_csum); | ||||
| 	sysfs_hprint(size,	u->sectors << 9); | ||||
| 
 | ||||
| 	if (attr == &sysfs_label) { | ||||
| 		memcpy(buf, u->label, SB_LABEL_SIZE); | ||||
| 		buf[SB_LABEL_SIZE + 1] = '\0'; | ||||
| 		strcat(buf, "\n"); | ||||
| 		return strlen(buf); | ||||
| 	} | ||||
| 
 | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| STORE(__bch_flash_dev) | ||||
| { | ||||
| 	struct bcache_device *d = container_of(kobj, struct bcache_device, | ||||
| 					       kobj); | ||||
| 	struct uuid_entry *u = &d->c->uuids[d->id]; | ||||
| 
 | ||||
| 	sysfs_strtoul(data_csum,	d->data_csum); | ||||
| 
 | ||||
| 	if (attr == &sysfs_size) { | ||||
| 		uint64_t v; | ||||
| 		strtoi_h_or_return(buf, v); | ||||
| 
 | ||||
| 		u->sectors = v >> 9; | ||||
| 		bch_uuid_write(d->c); | ||||
| 		set_capacity(d->disk, u->sectors); | ||||
| 	} | ||||
| 
 | ||||
| 	if (attr == &sysfs_label) { | ||||
| 		memcpy(u->label, buf, SB_LABEL_SIZE); | ||||
| 		bch_uuid_write(d->c); | ||||
| 	} | ||||
| 
 | ||||
| 	if (attr == &sysfs_unregister) { | ||||
| 		atomic_set(&d->detaching, 1); | ||||
| 		bcache_device_stop(d); | ||||
| 	} | ||||
| 
 | ||||
| 	return size; | ||||
| } | ||||
| STORE_LOCKED(bch_flash_dev) | ||||
| 
 | ||||
| static struct attribute *bch_flash_dev_files[] = { | ||||
| 	&sysfs_unregister, | ||||
| #if 0 | ||||
| 	&sysfs_data_csum, | ||||
| #endif | ||||
| 	&sysfs_label, | ||||
| 	&sysfs_size, | ||||
| 	NULL | ||||
| }; | ||||
| KTYPE(bch_flash_dev); | ||||
| 
 | ||||
| SHOW(__bch_cache_set) | ||||
| { | ||||
| 	unsigned root_usage(struct cache_set *c) | ||||
| 	{ | ||||
| 		unsigned bytes = 0; | ||||
| 		struct bkey *k; | ||||
| 		struct btree *b; | ||||
| 		struct btree_iter iter; | ||||
| 
 | ||||
| 		goto lock_root; | ||||
| 
 | ||||
| 		do { | ||||
| 			rw_unlock(false, b); | ||||
| lock_root: | ||||
| 			b = c->root; | ||||
| 			rw_lock(false, b, b->level); | ||||
| 		} while (b != c->root); | ||||
| 
 | ||||
| 		for_each_key_filter(b, k, &iter, bch_ptr_bad) | ||||
| 			bytes += bkey_bytes(k); | ||||
| 
 | ||||
| 		rw_unlock(false, b); | ||||
| 
 | ||||
| 		return (bytes * 100) / btree_bytes(c); | ||||
| 	} | ||||
| 
 | ||||
| 	size_t cache_size(struct cache_set *c) | ||||
| 	{ | ||||
| 		size_t ret = 0; | ||||
| 		struct btree *b; | ||||
| 
 | ||||
| 		mutex_lock(&c->bucket_lock); | ||||
| 		list_for_each_entry(b, &c->btree_cache, list) | ||||
| 			ret += 1 << (b->page_order + PAGE_SHIFT); | ||||
| 
 | ||||
| 		mutex_unlock(&c->bucket_lock); | ||||
| 		return ret; | ||||
| 	} | ||||
| 
 | ||||
| 	unsigned cache_max_chain(struct cache_set *c) | ||||
| 	{ | ||||
| 		unsigned ret = 0; | ||||
| 		struct hlist_head *h; | ||||
| 
 | ||||
| 		mutex_lock(&c->bucket_lock); | ||||
| 
 | ||||
| 		for (h = c->bucket_hash; | ||||
| 		     h < c->bucket_hash + (1 << BUCKET_HASH_BITS); | ||||
| 		     h++) { | ||||
| 			unsigned i = 0; | ||||
| 			struct hlist_node *p; | ||||
| 
 | ||||
| 			hlist_for_each(p, h) | ||||
| 				i++; | ||||
| 
 | ||||
| 			ret = max(ret, i); | ||||
| 		} | ||||
| 
 | ||||
| 		mutex_unlock(&c->bucket_lock); | ||||
| 		return ret; | ||||
| 	} | ||||
| 
 | ||||
| 	unsigned btree_used(struct cache_set *c) | ||||
| 	{ | ||||
| 		return div64_u64(c->gc_stats.key_bytes * 100, | ||||
| 				 (c->gc_stats.nodes ?: 1) * btree_bytes(c)); | ||||
| 	} | ||||
| 
 | ||||
| 	unsigned average_key_size(struct cache_set *c) | ||||
| 	{ | ||||
| 		return c->gc_stats.nkeys | ||||
| 			? div64_u64(c->gc_stats.data, c->gc_stats.nkeys) | ||||
| 			: 0; | ||||
| 	} | ||||
| 
 | ||||
| 	struct cache_set *c = container_of(kobj, struct cache_set, kobj); | ||||
| 
 | ||||
| 	sysfs_print(synchronous,		CACHE_SYNC(&c->sb)); | ||||
| 	sysfs_print(journal_delay_ms,		c->journal_delay_ms); | ||||
| 	sysfs_hprint(bucket_size,		bucket_bytes(c)); | ||||
| 	sysfs_hprint(block_size,		block_bytes(c)); | ||||
| 	sysfs_print(tree_depth,			c->root->level); | ||||
| 	sysfs_print(root_usage_percent,		root_usage(c)); | ||||
| 
 | ||||
| 	sysfs_hprint(btree_cache_size,		cache_size(c)); | ||||
| 	sysfs_print(btree_cache_max_chain,	cache_max_chain(c)); | ||||
| 	sysfs_print(cache_available_percent,	100 - c->gc_stats.in_use); | ||||
| 
 | ||||
| 	sysfs_print_time_stats(&c->btree_gc_time,	btree_gc, sec, ms); | ||||
| 	sysfs_print_time_stats(&c->btree_split_time,	btree_split, sec, us); | ||||
| 	sysfs_print_time_stats(&c->sort_time,		btree_sort, ms, us); | ||||
| 	sysfs_print_time_stats(&c->btree_read_time,	btree_read, ms, us); | ||||
| 	sysfs_print_time_stats(&c->try_harder_time,	try_harder, ms, us); | ||||
| 
 | ||||
| 	sysfs_print(btree_used_percent,	btree_used(c)); | ||||
| 	sysfs_print(btree_nodes,	c->gc_stats.nodes); | ||||
| 	sysfs_hprint(dirty_data,	c->gc_stats.dirty); | ||||
| 	sysfs_hprint(average_key_size,	average_key_size(c)); | ||||
| 
 | ||||
| 	sysfs_print(cache_read_races, | ||||
| 		    atomic_long_read(&c->cache_read_races)); | ||||
| 
 | ||||
| 	sysfs_print(writeback_keys_done, | ||||
| 		    atomic_long_read(&c->writeback_keys_done)); | ||||
| 	sysfs_print(writeback_keys_failed, | ||||
| 		    atomic_long_read(&c->writeback_keys_failed)); | ||||
| 
 | ||||
| 	/* See count_io_errors for why 88 */ | ||||
| 	sysfs_print(io_error_halflife,	c->error_decay * 88); | ||||
| 	sysfs_print(io_error_limit,	c->error_limit >> IO_ERROR_SHIFT); | ||||
| 
 | ||||
| 	sysfs_hprint(congested, | ||||
| 		     ((uint64_t) bch_get_congested(c)) << 9); | ||||
| 	sysfs_print(congested_read_threshold_us, | ||||
| 		    c->congested_read_threshold_us); | ||||
| 	sysfs_print(congested_write_threshold_us, | ||||
| 		    c->congested_write_threshold_us); | ||||
| 
 | ||||
| 	sysfs_print(active_journal_entries,	fifo_used(&c->journal.pin)); | ||||
| 	sysfs_printf(verify,			"%i", c->verify); | ||||
| 	sysfs_printf(key_merging_disabled,	"%i", c->key_merging_disabled); | ||||
| 	sysfs_printf(gc_always_rewrite,		"%i", c->gc_always_rewrite); | ||||
| 	sysfs_printf(btree_shrinker_disabled,	"%i", c->shrinker_disabled); | ||||
| 	sysfs_printf(copy_gc_enabled,		"%i", c->copy_gc_enabled); | ||||
| 
 | ||||
| 	if (attr == &sysfs_bset_tree_stats) | ||||
| 		return bch_bset_print_stats(c, buf); | ||||
| 
 | ||||
| 	return 0; | ||||
| } | ||||
| SHOW_LOCKED(bch_cache_set) | ||||
| 
 | ||||
| STORE(__bch_cache_set) | ||||
| { | ||||
| 	struct cache_set *c = container_of(kobj, struct cache_set, kobj); | ||||
| 
 | ||||
| 	if (attr == &sysfs_unregister) | ||||
| 		bch_cache_set_unregister(c); | ||||
| 
 | ||||
| 	if (attr == &sysfs_stop) | ||||
| 		bch_cache_set_stop(c); | ||||
| 
 | ||||
| 	if (attr == &sysfs_synchronous) { | ||||
| 		bool sync = strtoul_or_return(buf); | ||||
| 
 | ||||
| 		if (sync != CACHE_SYNC(&c->sb)) { | ||||
| 			SET_CACHE_SYNC(&c->sb, sync); | ||||
| 			bcache_write_super(c); | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	if (attr == &sysfs_flash_vol_create) { | ||||
| 		int r; | ||||
| 		uint64_t v; | ||||
| 		strtoi_h_or_return(buf, v); | ||||
| 
 | ||||
| 		r = bch_flash_dev_create(c, v); | ||||
| 		if (r) | ||||
| 			return r; | ||||
| 	} | ||||
| 
 | ||||
| 	if (attr == &sysfs_clear_stats) { | ||||
| 		atomic_long_set(&c->writeback_keys_done,	0); | ||||
| 		atomic_long_set(&c->writeback_keys_failed,	0); | ||||
| 
 | ||||
| 		memset(&c->gc_stats, 0, sizeof(struct gc_stat)); | ||||
| 		bch_cache_accounting_clear(&c->accounting); | ||||
| 	} | ||||
| 
 | ||||
| 	if (attr == &sysfs_trigger_gc) | ||||
| 		bch_queue_gc(c); | ||||
| 
 | ||||
| 	if (attr == &sysfs_prune_cache) { | ||||
| 		struct shrink_control sc; | ||||
| 		sc.gfp_mask = GFP_KERNEL; | ||||
| 		sc.nr_to_scan = strtoul_or_return(buf); | ||||
| 		c->shrink.shrink(&c->shrink, &sc); | ||||
| 	} | ||||
| 
 | ||||
| 	sysfs_strtoul(congested_read_threshold_us, | ||||
| 		      c->congested_read_threshold_us); | ||||
| 	sysfs_strtoul(congested_write_threshold_us, | ||||
| 		      c->congested_write_threshold_us); | ||||
| 
 | ||||
| 	if (attr == &sysfs_io_error_limit) | ||||
| 		c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT; | ||||
| 
 | ||||
| 	/* See count_io_errors() for why 88 */ | ||||
| 	if (attr == &sysfs_io_error_halflife) | ||||
| 		c->error_decay = strtoul_or_return(buf) / 88; | ||||
| 
 | ||||
| 	sysfs_strtoul(journal_delay_ms,		c->journal_delay_ms); | ||||
| 	sysfs_strtoul(verify,			c->verify); | ||||
| 	sysfs_strtoul(key_merging_disabled,	c->key_merging_disabled); | ||||
| 	sysfs_strtoul(gc_always_rewrite,	c->gc_always_rewrite); | ||||
| 	sysfs_strtoul(btree_shrinker_disabled,	c->shrinker_disabled); | ||||
| 	sysfs_strtoul(copy_gc_enabled,		c->copy_gc_enabled); | ||||
| 
 | ||||
| 	return size; | ||||
| } | ||||
| STORE_LOCKED(bch_cache_set) | ||||
| 
 | ||||
| SHOW(bch_cache_set_internal) | ||||
| { | ||||
| 	struct cache_set *c = container_of(kobj, struct cache_set, internal); | ||||
| 	return bch_cache_set_show(&c->kobj, attr, buf); | ||||
| } | ||||
| 
 | ||||
| STORE(bch_cache_set_internal) | ||||
| { | ||||
| 	struct cache_set *c = container_of(kobj, struct cache_set, internal); | ||||
| 	return bch_cache_set_store(&c->kobj, attr, buf, size); | ||||
| } | ||||
| 
 | ||||
| static void bch_cache_set_internal_release(struct kobject *k) | ||||
| { | ||||
| } | ||||
| 
 | ||||
| static struct attribute *bch_cache_set_files[] = { | ||||
| 	&sysfs_unregister, | ||||
| 	&sysfs_stop, | ||||
| 	&sysfs_synchronous, | ||||
| 	&sysfs_journal_delay_ms, | ||||
| 	&sysfs_flash_vol_create, | ||||
| 
 | ||||
| 	&sysfs_bucket_size, | ||||
| 	&sysfs_block_size, | ||||
| 	&sysfs_tree_depth, | ||||
| 	&sysfs_root_usage_percent, | ||||
| 	&sysfs_btree_cache_size, | ||||
| 	&sysfs_cache_available_percent, | ||||
| 
 | ||||
| 	&sysfs_average_key_size, | ||||
| 	&sysfs_dirty_data, | ||||
| 
 | ||||
| 	&sysfs_io_error_limit, | ||||
| 	&sysfs_io_error_halflife, | ||||
| 	&sysfs_congested, | ||||
| 	&sysfs_congested_read_threshold_us, | ||||
| 	&sysfs_congested_write_threshold_us, | ||||
| 	&sysfs_clear_stats, | ||||
| 	NULL | ||||
| }; | ||||
| KTYPE(bch_cache_set); | ||||
| 
 | ||||
| static struct attribute *bch_cache_set_internal_files[] = { | ||||
| 	&sysfs_active_journal_entries, | ||||
| 
 | ||||
| 	sysfs_time_stats_attribute_list(btree_gc, sec, ms) | ||||
| 	sysfs_time_stats_attribute_list(btree_split, sec, us) | ||||
| 	sysfs_time_stats_attribute_list(btree_sort, ms, us) | ||||
| 	sysfs_time_stats_attribute_list(btree_read, ms, us) | ||||
| 	sysfs_time_stats_attribute_list(try_harder, ms, us) | ||||
| 
 | ||||
| 	&sysfs_btree_nodes, | ||||
| 	&sysfs_btree_used_percent, | ||||
| 	&sysfs_btree_cache_max_chain, | ||||
| 
 | ||||
| 	&sysfs_bset_tree_stats, | ||||
| 	&sysfs_cache_read_races, | ||||
| 	&sysfs_writeback_keys_done, | ||||
| 	&sysfs_writeback_keys_failed, | ||||
| 
 | ||||
| 	&sysfs_trigger_gc, | ||||
| 	&sysfs_prune_cache, | ||||
| #ifdef CONFIG_BCACHE_DEBUG | ||||
| 	&sysfs_verify, | ||||
| 	&sysfs_key_merging_disabled, | ||||
| #endif | ||||
| 	&sysfs_gc_always_rewrite, | ||||
| 	&sysfs_btree_shrinker_disabled, | ||||
| 	&sysfs_copy_gc_enabled, | ||||
| 	NULL | ||||
| }; | ||||
| KTYPE(bch_cache_set_internal); | ||||
| 
 | ||||
| SHOW(__bch_cache) | ||||
| { | ||||
| 	struct cache *ca = container_of(kobj, struct cache, kobj); | ||||
| 
 | ||||
| 	sysfs_hprint(bucket_size,	bucket_bytes(ca)); | ||||
| 	sysfs_hprint(block_size,	block_bytes(ca)); | ||||
| 	sysfs_print(nbuckets,		ca->sb.nbuckets); | ||||
| 	sysfs_print(discard,		ca->discard); | ||||
| 	sysfs_hprint(written, atomic_long_read(&ca->sectors_written) << 9); | ||||
| 	sysfs_hprint(btree_written, | ||||
| 		     atomic_long_read(&ca->btree_sectors_written) << 9); | ||||
| 	sysfs_hprint(metadata_written, | ||||
| 		     (atomic_long_read(&ca->meta_sectors_written) + | ||||
| 		      atomic_long_read(&ca->btree_sectors_written)) << 9); | ||||
| 
 | ||||
| 	sysfs_print(io_errors, | ||||
| 		    atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT); | ||||
| 
 | ||||
| 	sysfs_print(freelist_percent, ca->free.size * 100 / | ||||
| 		    ((size_t) ca->sb.nbuckets)); | ||||
| 
 | ||||
| 	if (attr == &sysfs_cache_replacement_policy) | ||||
| 		return snprint_string_list(buf, PAGE_SIZE, | ||||
| 					   cache_replacement_policies, | ||||
| 					   CACHE_REPLACEMENT(&ca->sb)); | ||||
| 
 | ||||
| 	if (attr == &sysfs_priority_stats) { | ||||
| 		int cmp(const void *l, const void *r) | ||||
| 		{	return *((uint16_t *) r) - *((uint16_t *) l); } | ||||
| 
 | ||||
| 		/* Number of quantiles we compute */ | ||||
| 		const unsigned nq = 31; | ||||
| 
 | ||||
| 		size_t n = ca->sb.nbuckets, i, unused, btree; | ||||
| 		uint64_t sum = 0; | ||||
| 		uint16_t q[nq], *p, *cached; | ||||
| 		ssize_t ret; | ||||
| 
 | ||||
| 		cached = p = vmalloc(ca->sb.nbuckets * sizeof(uint16_t)); | ||||
| 		if (!p) | ||||
| 			return -ENOMEM; | ||||
| 
 | ||||
| 		mutex_lock(&ca->set->bucket_lock); | ||||
| 		for (i = ca->sb.first_bucket; i < n; i++) | ||||
| 			p[i] = ca->buckets[i].prio; | ||||
| 		mutex_unlock(&ca->set->bucket_lock); | ||||
| 
 | ||||
| 		sort(p, n, sizeof(uint16_t), cmp, NULL); | ||||
| 
 | ||||
| 		while (n && | ||||
| 		       !cached[n - 1]) | ||||
| 			--n; | ||||
| 
 | ||||
| 		unused = ca->sb.nbuckets - n; | ||||
| 
 | ||||
| 		while (cached < p + n && | ||||
| 		       *cached == BTREE_PRIO) | ||||
| 			cached++; | ||||
| 
 | ||||
| 		btree = cached - p; | ||||
| 		n -= btree; | ||||
| 
 | ||||
| 		for (i = 0; i < n; i++) | ||||
| 			sum += INITIAL_PRIO - cached[i]; | ||||
| 
 | ||||
| 		if (n) | ||||
| 			do_div(sum, n); | ||||
| 
 | ||||
| 		for (i = 0; i < nq; i++) | ||||
| 			q[i] = INITIAL_PRIO - cached[n * (i + 1) / (nq + 1)]; | ||||
| 
 | ||||
| 		vfree(p); | ||||
| 
 | ||||
| 		ret = snprintf(buf, PAGE_SIZE, | ||||
| 			       "Unused:		%zu%%\n" | ||||
| 			       "Metadata:	%zu%%\n" | ||||
| 			       "Average:	%llu\n" | ||||
| 			       "Sectors per Q:	%zu\n" | ||||
| 			       "Quantiles:	[", | ||||
| 			       unused * 100 / (size_t) ca->sb.nbuckets, | ||||
| 			       btree * 100 / (size_t) ca->sb.nbuckets, sum, | ||||
| 			       n * ca->sb.bucket_size / (nq + 1)); | ||||
| 
 | ||||
| 		for (i = 0; i < nq && ret < (ssize_t) PAGE_SIZE; i++) | ||||
| 			ret += snprintf(buf + ret, PAGE_SIZE - ret, | ||||
| 					i < nq - 1 ? "%u " : "%u]\n", q[i]); | ||||
| 
 | ||||
| 		buf[PAGE_SIZE - 1] = '\0'; | ||||
| 		return ret; | ||||
| 	} | ||||
| 
 | ||||
| 	return 0; | ||||
| } | ||||
| SHOW_LOCKED(bch_cache) | ||||
| 
 | ||||
| STORE(__bch_cache) | ||||
| { | ||||
| 	struct cache *ca = container_of(kobj, struct cache, kobj); | ||||
| 
 | ||||
| 	if (attr == &sysfs_discard) { | ||||
| 		bool v = strtoul_or_return(buf); | ||||
| 
 | ||||
| 		if (blk_queue_discard(bdev_get_queue(ca->bdev))) | ||||
| 			ca->discard = v; | ||||
| 
 | ||||
| 		if (v != CACHE_DISCARD(&ca->sb)) { | ||||
| 			SET_CACHE_DISCARD(&ca->sb, v); | ||||
| 			bcache_write_super(ca->set); | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	if (attr == &sysfs_cache_replacement_policy) { | ||||
| 		ssize_t v = read_string_list(buf, cache_replacement_policies); | ||||
| 
 | ||||
| 		if (v < 0) | ||||
| 			return v; | ||||
| 
 | ||||
| 		if ((unsigned) v != CACHE_REPLACEMENT(&ca->sb)) { | ||||
| 			mutex_lock(&ca->set->bucket_lock); | ||||
| 			SET_CACHE_REPLACEMENT(&ca->sb, v); | ||||
| 			mutex_unlock(&ca->set->bucket_lock); | ||||
| 
 | ||||
| 			bcache_write_super(ca->set); | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	if (attr == &sysfs_freelist_percent) { | ||||
| 		DECLARE_FIFO(long, free); | ||||
| 		long i; | ||||
| 		size_t p = strtoul_or_return(buf); | ||||
| 
 | ||||
| 		p = clamp_t(size_t, | ||||
| 			    ((size_t) ca->sb.nbuckets * p) / 100, | ||||
| 			    roundup_pow_of_two(ca->sb.nbuckets) >> 9, | ||||
| 			    ca->sb.nbuckets / 2); | ||||
| 
 | ||||
| 		if (!init_fifo_exact(&free, p, GFP_KERNEL)) | ||||
| 			return -ENOMEM; | ||||
| 
 | ||||
| 		mutex_lock(&ca->set->bucket_lock); | ||||
| 
 | ||||
| 		fifo_move(&free, &ca->free); | ||||
| 		fifo_swap(&free, &ca->free); | ||||
| 
 | ||||
| 		mutex_unlock(&ca->set->bucket_lock); | ||||
| 
 | ||||
| 		while (fifo_pop(&free, i)) | ||||
| 			atomic_dec(&ca->buckets[i].pin); | ||||
| 
 | ||||
| 		free_fifo(&free); | ||||
| 	} | ||||
| 
 | ||||
| 	if (attr == &sysfs_clear_stats) { | ||||
| 		atomic_long_set(&ca->sectors_written, 0); | ||||
| 		atomic_long_set(&ca->btree_sectors_written, 0); | ||||
| 		atomic_long_set(&ca->meta_sectors_written, 0); | ||||
| 		atomic_set(&ca->io_count, 0); | ||||
| 		atomic_set(&ca->io_errors, 0); | ||||
| 	} | ||||
| 
 | ||||
| 	return size; | ||||
| } | ||||
| STORE_LOCKED(bch_cache) | ||||
| 
 | ||||
| static struct attribute *bch_cache_files[] = { | ||||
| 	&sysfs_bucket_size, | ||||
| 	&sysfs_block_size, | ||||
| 	&sysfs_nbuckets, | ||||
| 	&sysfs_priority_stats, | ||||
| 	&sysfs_discard, | ||||
| 	&sysfs_written, | ||||
| 	&sysfs_btree_written, | ||||
| 	&sysfs_metadata_written, | ||||
| 	&sysfs_io_errors, | ||||
| 	&sysfs_clear_stats, | ||||
| 	&sysfs_freelist_percent, | ||||
| 	&sysfs_cache_replacement_policy, | ||||
| 	NULL | ||||
| }; | ||||
| KTYPE(bch_cache); | ||||
							
								
								
									
										110
									
								
								drivers/md/bcache/sysfs.h
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										110
									
								
								drivers/md/bcache/sysfs.h
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,110 @@ | |||
#ifndef _BCACHE_SYSFS_H_
#define _BCACHE_SYSFS_H_

/*
 * Define <type>_ktype from <type>_release, <type>_show, <type>_store and
 * the attribute array <type>_files.
 */
#define KTYPE(type)							\
struct kobj_type type ## _ktype = {					\
	.release	= type ## _release,				\
	.sysfs_ops	= &((const struct sysfs_ops) {			\
		.show	= type ## _show,				\
		.store	= type ## _store				\
	}),								\
	.default_attrs	= type ## _files				\
}

/* Emit the signature of <fn>_show; the function body follows the macro. */
#define SHOW(fn)							\
static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\
			   char *buf)

/* Emit the signature of <fn>_store; the function body follows the macro. */
#define STORE(fn)							\
static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\
			    const char *buf, size_t size)

/* Define <fn>_show as __<fn>_show wrapped in bch_register_lock. */
#define SHOW_LOCKED(fn)							\
SHOW(fn)								\
{									\
	ssize_t ret;							\
	mutex_lock(&bch_register_lock);					\
	ret = __ ## fn ## _show(kobj, attr, buf);			\
	mutex_unlock(&bch_register_lock);				\
	return ret;							\
}

/* Define <fn>_store as __<fn>_store wrapped in bch_register_lock. */
#define STORE_LOCKED(fn)						\
STORE(fn)								\
{									\
	ssize_t ret;							\
	mutex_lock(&bch_register_lock);					\
	ret = __ ## fn ## _store(kobj, attr, buf, size);		\
	mutex_unlock(&bch_register_lock);				\
	return ret;							\
}

/* Declare a file-local attribute object named sysfs_<name>. */
#define __sysfs_attribute(_name, _mode)					\
	static struct attribute sysfs_##_name =				\
		{ .name = #_name, .mode = _mode }

#define write_attribute(n)	__sysfs_attribute(n, S_IWUSR)
#define read_attribute(n)	__sysfs_attribute(n, S_IRUGO)
#define rw_attribute(n)		__sysfs_attribute(n, S_IRUGO|S_IWUSR)

/*
 * Show-side helpers.  They rely on `attr` and `buf` being in scope and
 * return from the enclosing function when `attr` is sysfs_<file>.
 */
#define sysfs_printf(file, fmt, ...)					\
do {									\
	if (attr == &sysfs_ ## file)					\
		return snprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__);	\
} while (0)

#define sysfs_print(file, var)						\
do {									\
	if (attr == &sysfs_ ## file)					\
		return snprint(buf, PAGE_SIZE, var);			\
} while (0)

/* Human-readable value via hprint(), plus a trailing newline. */
#define sysfs_hprint(file, val)						\
do {									\
	if (attr == &sysfs_ ## file) {					\
		ssize_t ret = hprint(buf, val);				\
		strcat(buf, "\n");					\
		return ret + 1;						\
	}								\
} while (0)

/* Variants that look the value up through a caller-defined var() macro. */
#define var_printf(_var, fmt)	sysfs_printf(_var, fmt, var(_var))
#define var_print(_var)		sysfs_print(_var, var(_var))
#define var_hprint(_var)	sysfs_hprint(_var, var(_var))

/*
 * Store-side helpers.  They rely on `attr`, `buf` and `size` being in
 * scope and return the parser's error, or `size` on success, when `attr`
 * is sysfs_<file>.
 */
#define sysfs_strtoul(file, var)					\
do {									\
	if (attr == &sysfs_ ## file)					\
		return strtoul_safe(buf, var) ?: (ssize_t) size;	\
} while (0)

#define sysfs_strtoul_clamp(file, var, min, max)			\
do {									\
	if (attr == &sysfs_ ## file)					\
		return strtoul_safe_clamp(buf, var, min, max)		\
			?: (ssize_t) size;				\
} while (0)

/*
 * Parse a decimal unsigned long; on failure return the kstrtoul() error
 * from the enclosing function, otherwise evaluate to the parsed value.
 */
#define strtoul_or_return(cp)						\
({									\
	unsigned long _v;						\
	int _r = kstrtoul(cp, 10, &_v);					\
	if (_r)								\
		return _r;						\
	_v;								\
})

/* Parse with strtoi_h() into v; return its error from the enclosing function. */
#define strtoi_h_or_return(cp, v)					\
do {									\
	int _r = strtoi_h(cp, &v);					\
	if (_r)								\
		return _r;						\
} while (0)

#define sysfs_hatoi(file, var)						\
do {									\
	if (attr == &sysfs_ ## file)					\
		return strtoi_h(buf, &var) ?: (ssize_t) size;		\
} while (0)

#endif  /* _BCACHE_SYSFS_H_ */
							
								
								
									
										26
									
								
								drivers/md/bcache/trace.c
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										26
									
								
								drivers/md/bcache/trace.c
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,26 @@ | |||
| #include "bcache.h" | ||||
| #include "btree.h" | ||||
| #include "request.h" | ||||
| 
 | ||||
| #include <linux/module.h> | ||||
| 
 | ||||
| #define CREATE_TRACE_POINTS | ||||
| #include <trace/events/bcache.h> | ||||
| 
 | ||||
/* Export every bcache tracepoint so GPL modules can attach to it. */

/* request path */
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_start);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_end);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_passthrough);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_hit);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_miss);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_retry);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writethrough);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_skip);

/* btree, writeback and journal I/O */
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_read);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_write);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_dirty);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_dirty);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_write);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_insert);

/* garbage collection */
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_start);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_end);
							
								
								
									
										389
									
								
								drivers/md/bcache/util.c
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										389
									
								
								drivers/md/bcache/util.c
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,389 @@ | |||
| /*
 | ||||
|  * random utility code, for bcache but in theory not specific to bcache | ||||
|  * | ||||
|  * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> | ||||
|  * Copyright 2012 Google, Inc. | ||||
|  */ | ||||
| 
 | ||||
| #include <linux/bio.h> | ||||
| #include <linux/blkdev.h> | ||||
| #include <linux/ctype.h> | ||||
| #include <linux/debugfs.h> | ||||
| #include <linux/module.h> | ||||
| #include <linux/seq_file.h> | ||||
| #include <linux/types.h> | ||||
| 
 | ||||
| #include "util.h" | ||||
| 
 | ||||
| #define simple_strtoint(c, end, base)	simple_strtol(c, end, base) | ||||
| #define simple_strtouint(c, end, base)	simple_strtoul(c, end, base) | ||||
| 
 | ||||
| #define STRTO_H(name, type)					\ | ||||
| int name ## _h(const char *cp, type *res)		        \ | ||||
| {								\ | ||||
| 	int u = 0;						\ | ||||
| 	char *e;						\ | ||||
| 	type i = simple_ ## name(cp, &e, 10);			\ | ||||
| 								\ | ||||
| 	switch (tolower(*e)) {					\ | ||||
| 	default:						\ | ||||
| 		return -EINVAL;					\ | ||||
| 	case 'y':						\ | ||||
| 	case 'z':						\ | ||||
| 		u++;						\ | ||||
| 	case 'e':						\ | ||||
| 		u++;						\ | ||||
| 	case 'p':						\ | ||||
| 		u++;						\ | ||||
| 	case 't':						\ | ||||
| 		u++;						\ | ||||
| 	case 'g':						\ | ||||
| 		u++;						\ | ||||
| 	case 'm':						\ | ||||
| 		u++;						\ | ||||
| 	case 'k':						\ | ||||
| 		u++;						\ | ||||
| 		if (e++ == cp)					\ | ||||
| 			return -EINVAL;				\ | ||||
| 	case '\n':						\ | ||||
| 	case '\0':						\ | ||||
| 		if (*e == '\n')					\ | ||||
| 			e++;					\ | ||||
| 	}							\ | ||||
| 								\ | ||||
| 	if (*e)							\ | ||||
| 		return -EINVAL;					\ | ||||
| 								\ | ||||
| 	while (u--) {						\ | ||||
| 		if ((type) ~0 > 0 &&				\ | ||||
| 		    (type) ~0 / 1024 <= i)			\ | ||||
| 			return -EINVAL;				\ | ||||
| 		if ((i > 0 && ANYSINT_MAX(type) / 1024 < i) ||	\ | ||||
| 		    (i < 0 && -ANYSINT_MAX(type) / 1024 > i))	\ | ||||
| 			return -EINVAL;				\ | ||||
| 		i *= 1024;					\ | ||||
| 	}							\ | ||||
| 								\ | ||||
| 	*res = i;						\ | ||||
| 	return 0;						\ | ||||
| }								\ | ||||
| EXPORT_SYMBOL_GPL(name ## _h); | ||||
| 
 | ||||
| STRTO_H(strtoint, int) | ||||
| STRTO_H(strtouint, unsigned int) | ||||
| STRTO_H(strtoll, long long) | ||||
| STRTO_H(strtoull, unsigned long long) | ||||
| 
 | ||||
/*
 * hprint - format @v in human-readable form using powers of 1024.
 * @buf: destination; the longest output is 6 characters plus the NUL
 * @v:   value to print (may be negative)
 *
 * Values whose magnitude is below 1024 are printed as plain integers.
 * Larger magnitudes are scaled down and given a unit suffix (k, M, G, ...);
 * when the scaled magnitude is below 100 one decimal digit is added.
 *
 * Returns the number of characters written, excluding the NUL.
 */
ssize_t hprint(char *buf, int64_t v)
{
	static const char units[] = "?kMGTPEZY";
	/* Scale the magnitude so negative values get a correct fraction. */
	uint64_t mag = v < 0 ? -(uint64_t) v : (uint64_t) v;
	const char *sign = v < 0 ? "-" : "";
	unsigned rem = 0;
	int u = 0;

	while (mag >= 1024) {
		rem = mag & 1023;
		mag >>= 10;
		u++;
	}

	if (!u)
		return sprintf(buf, "%lli", (long long) v);

	/* rem * 10 / 1024 is always a single digit (0..9). */
	if (mag < 100)
		return sprintf(buf, "%s%llu.%u%c", sign,
			       (unsigned long long) mag,
			       rem * 10 / 1024, units[u]);

	return sprintf(buf, "%s%llu%c", sign,
		       (unsigned long long) mag, units[u]);
}
| EXPORT_SYMBOL_GPL(hprint); | ||||
| 
 | ||||
/*
 * snprint_string_list - print a NULL-terminated list of strings separated
 * by spaces, with entry @selected wrapped in [brackets], ending in '\n'.
 * @buf:      destination buffer
 * @size:     size of @buf in bytes
 * @list:     NULL-terminated array of strings
 * @selected: index of the entry to highlight
 *
 * Output that does not fit is truncated; writes never go outside @buf.
 * An empty list produces an empty string.
 *
 * Returns the number of characters stored in @buf, excluding the NUL.
 */
ssize_t snprint_string_list(char *buf, size_t size, const char * const list[],
			    size_t selected)
{
	size_t used = 0;
	size_t i;

	if (!size)
		return 0;

	buf[0] = '\0';

	for (i = 0; list[i] && used < size - 1; i++) {
		int n = snprintf(buf + used, size - used,
				 i == selected ? "[%s] " : "%s ", list[i]);

		if (n < 0)
			break;

		/* snprintf() reports the untruncated length; clamp it. */
		if ((size_t) n >= size - used)
			used = size - 1;
		else
			used += n;
	}

	/* Turn the final separator into the terminating newline. */
	if (used)
		buf[used - 1] = '\n';

	return used;
}
| EXPORT_SYMBOL_GPL(snprint_string_list); | ||||
| 
 | ||||
| ssize_t read_string_list(const char *buf, const char * const list[]) | ||||
| { | ||||
| 	size_t i; | ||||
| 	char *s, *d = kstrndup(buf, PAGE_SIZE - 1, GFP_KERNEL); | ||||
| 	if (!d) | ||||
| 		return -ENOMEM; | ||||
| 
 | ||||
| 	s = strim(d); | ||||
| 
 | ||||
| 	for (i = 0; list[i]; i++) | ||||
| 		if (!strcmp(list[i], s)) | ||||
| 			break; | ||||
| 
 | ||||
| 	kfree(d); | ||||
| 
 | ||||
| 	if (!list[i]) | ||||
| 		return -EINVAL; | ||||
| 
 | ||||
| 	return i; | ||||
| } | ||||
| EXPORT_SYMBOL_GPL(read_string_list); | ||||
| 
 | ||||
/* Return true when the first @n bytes at @p are all zero (or @n is 0). */
bool is_zero(const char *p, size_t n)
{
	while (n--) {
		if (*p++)
			return false;
	}

	return true;
}
| EXPORT_SYMBOL_GPL(is_zero); | ||||
| 
 | ||||
/*
 * parse_uuid - parse a textual UUID into 16 raw bytes.
 * @s:    input; hex digits optionally separated by '-' or ':'
 * @uuid: 16-byte output buffer, zeroed before parsing
 *
 * Parsing stops at the first character that is neither a hex digit nor a
 * separator, or once 32 hex digits have been consumed.
 *
 * Returns the number of input characters consumed.
 */
int parse_uuid(const char *s, char *uuid)
{
	/* Length of the leading run of acceptable characters, computed once. */
	const size_t len = strspn(s, "-0123456789:ABCDEFabcdef");
	size_t i, j;

	memset(uuid, 0, 16);

	for (i = 0, j = 0; i < len && j < 32; i++) {
		unsigned x = s[i] | 32;	/* fold 'A'-'F' to lower case */

		if (x >= '0' && x <= '9')
			x -= '0';
		else if (x >= 'a' && x <= 'f')
			x -= 'a' - 10;
		else
			continue;	/* separator */

		/* Even digit index: high nibble; odd: low nibble. */
		if (!(j & 1))
			x <<= 4;
		uuid[j++ >> 1] |= x;
	}

	return i;
}
| EXPORT_SYMBOL_GPL(parse_uuid); | ||||
| 
 | ||||
| void time_stats_update(struct time_stats *stats, uint64_t start_time) | ||||
| { | ||||
| 	uint64_t now		= local_clock(); | ||||
| 	uint64_t duration	= time_after64(now, start_time) | ||||
| 		? now - start_time : 0; | ||||
| 	uint64_t last		= time_after64(now, stats->last) | ||||
| 		? now - stats->last : 0; | ||||
| 
 | ||||
| 	stats->max_duration = max(stats->max_duration, duration); | ||||
| 
 | ||||
| 	if (stats->last) { | ||||
| 		ewma_add(stats->average_duration, duration, 8, 8); | ||||
| 
 | ||||
| 		if (stats->average_frequency) | ||||
| 			ewma_add(stats->average_frequency, last, 8, 8); | ||||
| 		else | ||||
| 			stats->average_frequency  = last << 8; | ||||
| 	} else { | ||||
| 		stats->average_duration  = duration << 8; | ||||
| 	} | ||||
| 
 | ||||
| 	stats->last = now ?: 1; | ||||
| } | ||||
| EXPORT_SYMBOL_GPL(time_stats_update); | ||||
| 
 | ||||
| unsigned next_delay(struct ratelimit *d, uint64_t done) | ||||
| { | ||||
| 	uint64_t now = local_clock(); | ||||
| 
 | ||||
| 	d->next += div_u64(done, d->rate); | ||||
| 
 | ||||
| 	return time_after64(d->next, now) | ||||
| 		? div_u64(d->next - now, NSEC_PER_SEC / HZ) | ||||
| 		: 0; | ||||
| } | ||||
| EXPORT_SYMBOL_GPL(next_delay); | ||||
| 
 | ||||
| void bio_map(struct bio *bio, void *base) | ||||
| { | ||||
| 	size_t size = bio->bi_size; | ||||
| 	struct bio_vec *bv = bio->bi_io_vec; | ||||
| 
 | ||||
| 	BUG_ON(!bio->bi_size); | ||||
| 	BUG_ON(bio->bi_vcnt); | ||||
| 
 | ||||
| 	bv->bv_offset = base ? ((unsigned long) base) % PAGE_SIZE : 0; | ||||
| 	goto start; | ||||
| 
 | ||||
| 	for (; size; bio->bi_vcnt++, bv++) { | ||||
| 		bv->bv_offset	= 0; | ||||
| start:		bv->bv_len	= min_t(size_t, PAGE_SIZE - bv->bv_offset, | ||||
| 					size); | ||||
| 		if (base) { | ||||
| 			bv->bv_page = is_vmalloc_addr(base) | ||||
| 				? vmalloc_to_page(base) | ||||
| 				: virt_to_page(base); | ||||
| 
 | ||||
| 			base += bv->bv_len; | ||||
| 		} | ||||
| 
 | ||||
| 		size -= bv->bv_len; | ||||
| 	} | ||||
| } | ||||
| EXPORT_SYMBOL_GPL(bio_map); | ||||
| 
 | ||||
| int bio_alloc_pages(struct bio *bio, gfp_t gfp) | ||||
| { | ||||
| 	int i; | ||||
| 	struct bio_vec *bv; | ||||
| 
 | ||||
| 	bio_for_each_segment(bv, bio, i) { | ||||
| 		bv->bv_page = alloc_page(gfp); | ||||
| 		if (!bv->bv_page) { | ||||
| 			while (bv-- != bio->bi_io_vec + bio->bi_idx) | ||||
| 				__free_page(bv->bv_page); | ||||
| 			return -ENOMEM; | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	return 0; | ||||
| } | ||||
| EXPORT_SYMBOL_GPL(bio_alloc_pages); | ||||
| 
 | ||||
| /*
 | ||||
|  * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any | ||||
|  * use permitted, subject to terms of PostgreSQL license; see.) | ||||
| 
 | ||||
|  * If we have a 64-bit integer type, then a 64-bit CRC looks just like the | ||||
|  * usual sort of implementation. (See Ross Williams' excellent introduction | ||||
|  * A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from | ||||
|  * ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.)
 | ||||
|  * If we have no working 64-bit type, then fake it with two 32-bit registers. | ||||
|  * | ||||
|  * The present implementation is a normal (not "reflected", in Williams' | ||||
|  * terms) 64-bit CRC, using initial all-ones register contents and a final | ||||
|  * bit inversion. The chosen polynomial is borrowed from the DLT1 spec | ||||
|  * (ECMA-182, available from http://www.ecma.ch/ecma1/STAND/ECMA-182.HTM):
 | ||||
|  * | ||||
|  * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 + | ||||
|  * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 + | ||||
|  * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 + | ||||
|  * x^7 + x^4 + x + 1 | ||||
| */ | ||||
| 
 | ||||
| static const uint64_t crc_table[256] = { | ||||
| 	0x0000000000000000, 0x42F0E1EBA9EA3693, 0x85E1C3D753D46D26, | ||||
| 	0xC711223CFA3E5BB5, 0x493366450E42ECDF, 0x0BC387AEA7A8DA4C, | ||||
| 	0xCCD2A5925D9681F9, 0x8E224479F47CB76A, 0x9266CC8A1C85D9BE, | ||||
| 	0xD0962D61B56FEF2D, 0x17870F5D4F51B498, 0x5577EEB6E6BB820B, | ||||
| 	0xDB55AACF12C73561, 0x99A54B24BB2D03F2, 0x5EB4691841135847, | ||||
| 	0x1C4488F3E8F96ED4, 0x663D78FF90E185EF, 0x24CD9914390BB37C, | ||||
| 	0xE3DCBB28C335E8C9, 0xA12C5AC36ADFDE5A, 0x2F0E1EBA9EA36930, | ||||
| 	0x6DFEFF5137495FA3, 0xAAEFDD6DCD770416, 0xE81F3C86649D3285, | ||||
| 	0xF45BB4758C645C51, 0xB6AB559E258E6AC2, 0x71BA77A2DFB03177, | ||||
| 	0x334A9649765A07E4, 0xBD68D2308226B08E, 0xFF9833DB2BCC861D, | ||||
| 	0x388911E7D1F2DDA8, 0x7A79F00C7818EB3B, 0xCC7AF1FF21C30BDE, | ||||
| 	0x8E8A101488293D4D, 0x499B3228721766F8, 0x0B6BD3C3DBFD506B, | ||||
| 	0x854997BA2F81E701, 0xC7B97651866BD192, 0x00A8546D7C558A27, | ||||
| 	0x4258B586D5BFBCB4, 0x5E1C3D753D46D260, 0x1CECDC9E94ACE4F3, | ||||
| 	0xDBFDFEA26E92BF46, 0x990D1F49C77889D5, 0x172F5B3033043EBF, | ||||
| 	0x55DFBADB9AEE082C, 0x92CE98E760D05399, 0xD03E790CC93A650A, | ||||
| 	0xAA478900B1228E31, 0xE8B768EB18C8B8A2, 0x2FA64AD7E2F6E317, | ||||
| 	0x6D56AB3C4B1CD584, 0xE374EF45BF6062EE, 0xA1840EAE168A547D, | ||||
| 	0x66952C92ECB40FC8, 0x2465CD79455E395B, 0x3821458AADA7578F, | ||||
| 	0x7AD1A461044D611C, 0xBDC0865DFE733AA9, 0xFF3067B657990C3A, | ||||
| 	0x711223CFA3E5BB50, 0x33E2C2240A0F8DC3, 0xF4F3E018F031D676, | ||||
| 	0xB60301F359DBE0E5, 0xDA050215EA6C212F, 0x98F5E3FE438617BC, | ||||
| 	0x5FE4C1C2B9B84C09, 0x1D14202910527A9A, 0x93366450E42ECDF0, | ||||
| 	0xD1C685BB4DC4FB63, 0x16D7A787B7FAA0D6, 0x5427466C1E109645, | ||||
| 	0x4863CE9FF6E9F891, 0x0A932F745F03CE02, 0xCD820D48A53D95B7, | ||||
| 	0x8F72ECA30CD7A324, 0x0150A8DAF8AB144E, 0x43A04931514122DD, | ||||
| 	0x84B16B0DAB7F7968, 0xC6418AE602954FFB, 0xBC387AEA7A8DA4C0, | ||||
| 	0xFEC89B01D3679253, 0x39D9B93D2959C9E6, 0x7B2958D680B3FF75, | ||||
| 	0xF50B1CAF74CF481F, 0xB7FBFD44DD257E8C, 0x70EADF78271B2539, | ||||
| 	0x321A3E938EF113AA, 0x2E5EB66066087D7E, 0x6CAE578BCFE24BED, | ||||
| 	0xABBF75B735DC1058, 0xE94F945C9C3626CB, 0x676DD025684A91A1, | ||||
| 	0x259D31CEC1A0A732, 0xE28C13F23B9EFC87, 0xA07CF2199274CA14, | ||||
| 	0x167FF3EACBAF2AF1, 0x548F120162451C62, 0x939E303D987B47D7, | ||||
| 	0xD16ED1D631917144, 0x5F4C95AFC5EDC62E, 0x1DBC74446C07F0BD, | ||||
| 	0xDAAD56789639AB08, 0x985DB7933FD39D9B, 0x84193F60D72AF34F, | ||||
| 	0xC6E9DE8B7EC0C5DC, 0x01F8FCB784FE9E69, 0x43081D5C2D14A8FA, | ||||
| 	0xCD2A5925D9681F90, 0x8FDAB8CE70822903, 0x48CB9AF28ABC72B6, | ||||
| 	0x0A3B7B1923564425, 0x70428B155B4EAF1E, 0x32B26AFEF2A4998D, | ||||
| 	0xF5A348C2089AC238, 0xB753A929A170F4AB, 0x3971ED50550C43C1, | ||||
| 	0x7B810CBBFCE67552, 0xBC902E8706D82EE7, 0xFE60CF6CAF321874, | ||||
| 	0xE224479F47CB76A0, 0xA0D4A674EE214033, 0x67C58448141F1B86, | ||||
| 	0x253565A3BDF52D15, 0xAB1721DA49899A7F, 0xE9E7C031E063ACEC, | ||||
| 	0x2EF6E20D1A5DF759, 0x6C0603E6B3B7C1CA, 0xF6FAE5C07D3274CD, | ||||
| 	0xB40A042BD4D8425E, 0x731B26172EE619EB, 0x31EBC7FC870C2F78, | ||||
| 	0xBFC9838573709812, 0xFD39626EDA9AAE81, 0x3A28405220A4F534, | ||||
| 	0x78D8A1B9894EC3A7, 0x649C294A61B7AD73, 0x266CC8A1C85D9BE0, | ||||
| 	0xE17DEA9D3263C055, 0xA38D0B769B89F6C6, 0x2DAF4F0F6FF541AC, | ||||
| 	0x6F5FAEE4C61F773F, 0xA84E8CD83C212C8A, 0xEABE6D3395CB1A19, | ||||
| 	0x90C79D3FEDD3F122, 0xD2377CD44439C7B1, 0x15265EE8BE079C04, | ||||
| 	0x57D6BF0317EDAA97, 0xD9F4FB7AE3911DFD, 0x9B041A914A7B2B6E, | ||||
| 	0x5C1538ADB04570DB, 0x1EE5D94619AF4648, 0x02A151B5F156289C, | ||||
| 	0x4051B05E58BC1E0F, 0x87409262A28245BA, 0xC5B073890B687329, | ||||
| 	0x4B9237F0FF14C443, 0x0962D61B56FEF2D0, 0xCE73F427ACC0A965, | ||||
| 	0x8C8315CC052A9FF6, 0x3A80143F5CF17F13, 0x7870F5D4F51B4980, | ||||
| 	0xBF61D7E80F251235, 0xFD913603A6CF24A6, 0x73B3727A52B393CC, | ||||
| 	0x31439391FB59A55F, 0xF652B1AD0167FEEA, 0xB4A25046A88DC879, | ||||
| 	0xA8E6D8B54074A6AD, 0xEA16395EE99E903E, 0x2D071B6213A0CB8B, | ||||
| 	0x6FF7FA89BA4AFD18, 0xE1D5BEF04E364A72, 0xA3255F1BE7DC7CE1, | ||||
| 	0x64347D271DE22754, 0x26C49CCCB40811C7, 0x5CBD6CC0CC10FAFC, | ||||
| 	0x1E4D8D2B65FACC6F, 0xD95CAF179FC497DA, 0x9BAC4EFC362EA149, | ||||
| 	0x158E0A85C2521623, 0x577EEB6E6BB820B0, 0x906FC95291867B05, | ||||
| 	0xD29F28B9386C4D96, 0xCEDBA04AD0952342, 0x8C2B41A1797F15D1, | ||||
| 	0x4B3A639D83414E64, 0x09CA82762AAB78F7, 0x87E8C60FDED7CF9D, | ||||
| 	0xC51827E4773DF90E, 0x020905D88D03A2BB, 0x40F9E43324E99428, | ||||
| 	0x2CFFE7D5975E55E2, 0x6E0F063E3EB46371, 0xA91E2402C48A38C4, | ||||
| 	0xEBEEC5E96D600E57, 0x65CC8190991CB93D, 0x273C607B30F68FAE, | ||||
| 	0xE02D4247CAC8D41B, 0xA2DDA3AC6322E288, 0xBE992B5F8BDB8C5C, | ||||
| 	0xFC69CAB42231BACF, 0x3B78E888D80FE17A, 0x7988096371E5D7E9, | ||||
| 	0xF7AA4D1A85996083, 0xB55AACF12C735610, 0x724B8ECDD64D0DA5, | ||||
| 	0x30BB6F267FA73B36, 0x4AC29F2A07BFD00D, 0x08327EC1AE55E69E, | ||||
| 	0xCF235CFD546BBD2B, 0x8DD3BD16FD818BB8, 0x03F1F96F09FD3CD2, | ||||
| 	0x41011884A0170A41, 0x86103AB85A2951F4, 0xC4E0DB53F3C36767, | ||||
| 	0xD8A453A01B3A09B3, 0x9A54B24BB2D03F20, 0x5D45907748EE6495, | ||||
| 	0x1FB5719CE1045206, 0x919735E51578E56C, 0xD367D40EBC92D3FF, | ||||
| 	0x1476F63246AC884A, 0x568617D9EF46BED9, 0xE085162AB69D5E3C, | ||||
| 	0xA275F7C11F7768AF, 0x6564D5FDE549331A, 0x279434164CA30589, | ||||
| 	0xA9B6706FB8DFB2E3, 0xEB46918411358470, 0x2C57B3B8EB0BDFC5, | ||||
| 	0x6EA7525342E1E956, 0x72E3DAA0AA188782, 0x30133B4B03F2B111, | ||||
| 	0xF7021977F9CCEAA4, 0xB5F2F89C5026DC37, 0x3BD0BCE5A45A6B5D, | ||||
| 	0x79205D0E0DB05DCE, 0xBE317F32F78E067B, 0xFCC19ED95E6430E8, | ||||
| 	0x86B86ED5267CDBD3, 0xC4488F3E8F96ED40, 0x0359AD0275A8B6F5, | ||||
| 	0x41A94CE9DC428066, 0xCF8B0890283E370C, 0x8D7BE97B81D4019F, | ||||
| 	0x4A6ACB477BEA5A2A, 0x089A2AACD2006CB9, 0x14DEA25F3AF9026D, | ||||
| 	0x562E43B4931334FE, 0x913F6188692D6F4B, 0xD3CF8063C0C759D8, | ||||
| 	0x5DEDC41A34BBEEB2, 0x1F1D25F19D51D821, 0xD80C07CD676F8394, | ||||
| 	0x9AFCE626CE85B507 | ||||
| }; | ||||
| 
 | ||||
| uint64_t crc64_update(uint64_t crc, const void *_data, size_t len) | ||||
| { | ||||
| 	const unsigned char *data = _data; | ||||
| 
 | ||||
| 	while (len--) { | ||||
| 		int i = ((int) (crc >> 56) ^ *data++) & 0xFF; | ||||
| 		crc = crc_table[i] ^ (crc << 8); | ||||
| 	} | ||||
| 
 | ||||
| 	return crc; | ||||
| } | ||||
| EXPORT_SYMBOL(crc64_update); | ||||
| 
 | ||||
/*
 * One-shot CRC-64 of @len bytes at @data: the register starts as all ones and
 * the result of crc64_update() is bit-inverted before it is returned.
 */
uint64_t crc64(const void *data, size_t len)
{
	const uint64_t all_ones = ~0ULL;

	return crc64_update(all_ones, data, len) ^ all_ones;
}
| EXPORT_SYMBOL(crc64); | ||||
							
								
								
									
										589
									
								
								drivers/md/bcache/util.h
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										589
									
								
								drivers/md/bcache/util.h
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,589 @@ | |||
| 
 | ||||
| #ifndef _BCACHE_UTIL_H | ||||
| #define _BCACHE_UTIL_H | ||||
| 
 | ||||
| #include <linux/errno.h> | ||||
| #include <linux/kernel.h> | ||||
| #include <linux/llist.h> | ||||
| #include <linux/ratelimit.h> | ||||
| #include <linux/vmalloc.h> | ||||
| #include <linux/workqueue.h> | ||||
| 
 | ||||
| #include "closure.h" | ||||
| 
 | ||||
| #define PAGE_SECTORS		(PAGE_SIZE / 512) | ||||
| 
 | ||||
| struct closure; | ||||
| 
 | ||||
| #include <trace/events/bcache.h> | ||||
| 
 | ||||
| #ifdef CONFIG_BCACHE_EDEBUG | ||||
| 
 | ||||
| #define atomic_dec_bug(v)	BUG_ON(atomic_dec_return(v) < 0) | ||||
| #define atomic_inc_bug(v, i)	BUG_ON(atomic_inc_return(v) <= i) | ||||
| 
 | ||||
| #else /* EDEBUG */ | ||||
| 
 | ||||
| #define atomic_dec_bug(v)	atomic_dec(v) | ||||
| #define atomic_inc_bug(v, i)	atomic_inc(v) | ||||
| 
 | ||||
| #endif | ||||
| 
 | ||||
/*
 * BITMASK(name, type, field, offset, size) generates a getter name() and a
 * setter SET_name() for the @size bit wide bitfield that starts at bit
 * @offset of the integer member @field in @type.
 *
 * The setter masks the new value to @size bits before merging it, so a value
 * that is too large can no longer spill into the neighbouring bitfields.
 * @size must be smaller than 64, since the mask is built with a 64 bit shift.
 */
#define BITMASK(name, type, field, offset, size)		\
static inline uint64_t name(const type *k)			\
{ return (k->field >> (offset)) & ~(((uint64_t) ~0) << (size)); }	\
								\
static inline void SET_##name(type *k, uint64_t v)		\
{								\
	k->field &= ~(~((uint64_t) ~0 << (size)) << (offset));	\
	k->field |= (v & ~((uint64_t) ~0 << (size))) << (offset);	\
}
| 
 | ||||
/*
 * Array backed binary heap.
 *
 * @size is the capacity, @used the number of elements currently stored, and
 * the root lives in data[0]. Every operation takes a comparison cmp(a, b)
 * that must be true when @a belongs further from the root than @b; e.g.
 * "a > b" keeps the smallest element at the root.
 */
#define DECLARE_HEAP(type, name)					\
	struct {							\
		size_t size, used;					\
		type *data;						\
	} name

/*
 * Allocate room for @_size elements: kmalloc() when the byte size is below
 * KMALLOC_MAX_SIZE, then vmalloc() if nothing was obtained and @gfp shares a
 * bit with GFP_KERNEL. Evaluates to the data pointer (NULL on failure).
 */
#define init_heap(heap, _size, gfp)					\
({									\
	size_t _bytes;							\
	(heap)->used = 0;						\
	(heap)->size = (_size);						\
	_bytes = (heap)->size * sizeof(*(heap)->data);			\
	(heap)->data = NULL;						\
	if (_bytes < KMALLOC_MAX_SIZE)					\
		(heap)->data = kmalloc(_bytes, (gfp));			\
	if ((!(heap)->data) && ((gfp) & GFP_KERNEL))			\
		(heap)->data = vmalloc(_bytes);				\
	(heap)->data;							\
})

/* Release the storage with the deallocator matching how it was obtained */
#define free_heap(heap)							\
do {									\
	if (is_vmalloc_addr((heap)->data))				\
		vfree((heap)->data);					\
	else								\
		kfree((heap)->data);					\
	(heap)->data = NULL;						\
} while (0)

#define heap_swap(h, i, j)	swap((h)->data[i], (h)->data[j])

/* Move the element at index @i towards the leaves until order is restored */
#define heap_sift(h, i, cmp)						\
do {									\
	size_t _r, _j = i;						\
									\
	for (; _j * 2 + 1 < (h)->used; _j = _r) {			\
		_r = _j * 2 + 1;					\
		if (_r + 1 < (h)->used &&				\
		    cmp((h)->data[_r], (h)->data[_r + 1]))		\
			_r++;						\
									\
		if (cmp((h)->data[_r], (h)->data[_j]))			\
			break;						\
		heap_swap(h, _r, _j);					\
	}								\
} while (0)

/*
 * Despite its name this moves the element at index @i towards the root,
 * swapping with its parent until order is restored. @i must be an lvalue:
 * it is updated to the element's final index.
 */
#define heap_sift_down(h, i, cmp)					\
do {									\
	while (i) {							\
		size_t p = (i - 1) / 2;					\
		if (cmp((h)->data[i], (h)->data[p]))			\
			break;						\
		heap_swap(h, i, p);					\
		i = p;							\
	}								\
} while (0)

/* Insert @d; evaluates to false (and stores nothing) when the heap is full */
#define heap_add(h, d, cmp)						\
({									\
	bool _r = !heap_full(h);					\
	if (_r) {							\
		size_t _i = (h)->used++;				\
		(h)->data[_i] = d;					\
									\
		heap_sift_down(h, _i, cmp);				\
		heap_sift(h, _i, cmp);					\
	}								\
	_r;								\
})

/*
 * Remove the root into @d; evaluates to false when the heap is empty, in
 * which case @d is left untouched.
 */
#define heap_pop(h, d, cmp)						\
({									\
	bool _r = (h)->used;						\
	if (_r) {							\
		(d) = (h)->data[0];					\
		(h)->used--;						\
		heap_swap(h, 0, (h)->used);				\
		heap_sift(h, 0, cmp);					\
	}								\
	_r;								\
})

/*
 * Root element without removing it, or NULL when the heap holds no elements.
 * This has to test @used rather than the capacity: data[0] of an empty heap
 * is stale or uninitialised.
 */
#define heap_peek(h)	((h)->used ? (h)->data[0] : NULL)

#define heap_full(h)	((h)->used == (h)->size)
| 
 | ||||
/*
 * Ring buffer FIFO.
 *
 * @size is the usable capacity. The backing array has
 * roundup_pow_of_two(size + 1) slots and @mask is that count minus one, so
 * indices wrap with a simple AND. @front is the index of the oldest element,
 * @back the index where the next element will be stored.
 */
#define DECLARE_FIFO(type, name)					\
	struct {							\
		size_t front, back, size, mask;				\
		type *data;						\
	} name

/*
 * Iterate over the stored elements from front to back. Note that @c is loaded
 * from data[iter] before @iter is compared with @back.
 */
#define fifo_for_each(c, fifo, iter)					\
	for (iter = (fifo)->front;					\
	     c = (fifo)->data[iter], iter != (fifo)->back;		\
	     iter = (iter + 1) & (fifo)->mask)

/*
 * Allocate the backing array for a fifo whose @size is already set (and
 * non-zero). Uses kmalloc() below KMALLOC_MAX_SIZE and falls back to
 * vmalloc() when @gfp shares a bit with GFP_KERNEL. Evaluates to the data
 * pointer (NULL on failure).
 */
#define __init_fifo(fifo, gfp)						\
({									\
	size_t _allocated_size, _bytes;					\
	BUG_ON(!(fifo)->size);						\
									\
	_allocated_size = roundup_pow_of_two((fifo)->size + 1);		\
	_bytes = _allocated_size * sizeof(*(fifo)->data);		\
									\
	(fifo)->mask = _allocated_size - 1;				\
	(fifo)->front = (fifo)->back = 0;				\
	(fifo)->data = NULL;						\
									\
	if (_bytes < KMALLOC_MAX_SIZE)					\
		(fifo)->data = kmalloc(_bytes, (gfp));			\
	if ((!(fifo)->data) && ((gfp) & GFP_KERNEL))			\
		(fifo)->data = vmalloc(_bytes);				\
	(fifo)->data;							\
})

/* Initialise with a capacity of exactly @_size elements */
#define init_fifo_exact(fifo, _size, gfp)				\
({									\
	(fifo)->size = (_size);						\
	__init_fifo(fifo, gfp);						\
})

/*
 * Initialise with the capacity adjusted (for sizes above 4) to
 * roundup_pow_of_two(_size) - 1.
 * NOTE(review): assuming roundup_pow_of_two() leaves an exact power of two
 * unchanged, such a @_size ends up with _size - 1 usable slots — confirm
 * callers tolerate that.
 */
#define init_fifo(fifo, _size, gfp)					\
({									\
	(fifo)->size = (_size);						\
	if ((fifo)->size > 4)						\
		(fifo)->size = roundup_pow_of_two((fifo)->size) - 1;	\
	__init_fifo(fifo, gfp);						\
})

/* Release the storage with the deallocator matching how it was obtained */
#define free_fifo(fifo)							\
do {									\
	if (is_vmalloc_addr((fifo)->data))				\
		vfree((fifo)->data);					\
	else								\
		kfree((fifo)->data);					\
	(fifo)->data = NULL;						\
} while (0)

#define fifo_used(fifo)		(((fifo)->back - (fifo)->front) & (fifo)->mask)
#define fifo_free(fifo)		((fifo)->size - fifo_used(fifo))

#define fifo_empty(fifo)	(!fifo_used(fifo))
#define fifo_full(fifo)		(!fifo_free(fifo))

/* Oldest and newest element; only meaningful when the fifo is not empty */
#define fifo_front(fifo)	((fifo)->data[(fifo)->front])
#define fifo_back(fifo)							\
	((fifo)->data[((fifo)->back - 1) & (fifo)->mask])

/* Position of the element at pointer @p, counted from the front */
#define fifo_idx(fifo, p)	(((p) - &fifo_front(fifo)) & (fifo)->mask)

/* Append @i at the back; evaluates to false when the fifo is full */
#define fifo_push_back(fifo, i)						\
({									\
	bool _r = !fifo_full((fifo));					\
	if (_r) {							\
		(fifo)->data[(fifo)->back++] = (i);			\
		(fifo)->back &= (fifo)->mask;				\
	}								\
	_r;								\
})

/* Remove the oldest element into @i; evaluates to false when empty */
#define fifo_pop_front(fifo, i)						\
({									\
	bool _r = !fifo_empty((fifo));					\
	if (_r) {							\
		(i) = (fifo)->data[(fifo)->front++];			\
		(fifo)->front &= (fifo)->mask;				\
	}								\
	_r;								\
})

/* Insert @i in front of the oldest element; false when the fifo is full */
#define fifo_push_front(fifo, i)					\
({									\
	bool _r = !fifo_full((fifo));					\
	if (_r) {							\
		--(fifo)->front;					\
		(fifo)->front &= (fifo)->mask;				\
		(fifo)->data[(fifo)->front] = (i);			\
	}								\
	_r;								\
})

/* Remove the newest element into @i; evaluates to false when empty */
#define fifo_pop_back(fifo, i)						\
({									\
	bool _r = !fifo_empty((fifo));					\
	if (_r) {							\
		--(fifo)->back;						\
		(fifo)->back &= (fifo)->mask;				\
		(i) = (fifo)->data[(fifo)->back];			\
	}								\
	_r;								\
})

#define fifo_push(fifo, i)	fifo_push_back(fifo, (i))
#define fifo_pop(fifo, i)	fifo_pop_front(fifo, (i))

/* Exchange the complete state (indices, capacity and storage) of two fifos */
#define fifo_swap(l, r)							\
do {									\
	swap((l)->front, (r)->front);					\
	swap((l)->back, (r)->back);					\
	swap((l)->size, (r)->size);					\
	swap((l)->mask, (r)->mask);					\
	swap((l)->data, (r)->data);					\
} while (0)

/* Move elements from @src to @dest until @dest is full or @src is empty */
#define fifo_move(dest, src)						\
do {									\
	typeof(*((dest)->data)) _t;					\
	while (!fifo_full(dest) &&					\
	       fifo_pop(src, _t))					\
		fifo_push(dest, _t);					\
} while (0)
| 
 | ||||
| /*
 | ||||
|  * Simple array based allocator - preallocates a number of elements and you can | ||||
|  * never allocate more than that, also has no locking. | ||||
|  * | ||||
|  * Handy because if you know you only need a fixed number of elements you don't | ||||
|  * have to worry about memory allocation failure, and sometimes a mempool isn't | ||||
|  * what you want. | ||||
|  * | ||||
|  * We treat the free elements as entries in a singly linked list, and the | ||||
|  * freelist as a stack - allocating and freeing push and pop off the freelist. | ||||
|  */ | ||||
| 
 | ||||
/*
 * Fixed pool of @size elements of @type. A free element stores the pointer to
 * the next free element in its own first bytes, which is why every element
 * must be at least pointer sized (checked in array_allocator_init()).
 */
#define DECLARE_ARRAY_ALLOCATOR(type, name, size)			\
	struct {							\
		type	*freelist;					\
		type	data[size];					\
	} name

/* Take the head of the freelist; evaluates to NULL when the pool is empty */
#define array_alloc(array)						\
({									\
	typeof((array)->freelist) _head = (array)->freelist;		\
									\
	if (_head != NULL)						\
		(array)->freelist =					\
			*((typeof((array)->freelist) *) _head);		\
									\
	_head;								\
})

/* Push @ptr back onto the freelist; its first bytes are overwritten */
#define array_free(array, ptr)						\
do {									\
	typeof((array)->freelist) _elem = ptr;				\
									\
	*((typeof((array)->freelist) *) _elem) = (array)->freelist;	\
	(array)->freelist = _elem;					\
} while (0)

/* Reset the pool and put every element of data[] on the freelist */
#define array_allocator_init(array)					\
do {									\
	size_t _n;							\
									\
	BUILD_BUG_ON(sizeof((array)->data[0]) < sizeof(void *));	\
	(array)->freelist = NULL;					\
									\
	for (_n = 0; _n < ARRAY_SIZE((array)->data); _n++)		\
		array_free(array, &(array)->data[_n]);			\
} while (0)

#define array_freelist_empty(array)	((array)->freelist == NULL)
| 
 | ||||
| #define ANYSINT_MAX(t)							\ | ||||
| 	((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) | ||||
| 
 | ||||
| int strtoint_h(const char *, int *); | ||||
| int strtouint_h(const char *, unsigned int *); | ||||
| int strtoll_h(const char *, long long *); | ||||
| int strtoull_h(const char *, unsigned long long *); | ||||
| 
 | ||||
| static inline int strtol_h(const char *cp, long *res) | ||||
| { | ||||
| #if BITS_PER_LONG == 32 | ||||
| 	return strtoint_h(cp, (int *) res); | ||||
| #else | ||||
| 	return strtoll_h(cp, (long long *) res); | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| static inline int strtoul_h(const char *cp, long *res) | ||||
| { | ||||
| #if BITS_PER_LONG == 32 | ||||
| 	return strtouint_h(cp, (unsigned int *) res); | ||||
| #else | ||||
| 	return strtoull_h(cp, (unsigned long long *) res); | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| #define strtoi_h(cp, res)						\ | ||||
| 	(__builtin_types_compatible_p(typeof(*res), int)		\ | ||||
| 	? strtoint_h(cp, (void *) res)					\ | ||||
| 	: __builtin_types_compatible_p(typeof(*res), long)		\ | ||||
| 	? strtol_h(cp, (void *) res)					\ | ||||
| 	: __builtin_types_compatible_p(typeof(*res), long long)		\ | ||||
| 	? strtoll_h(cp, (void *) res)					\ | ||||
| 	: __builtin_types_compatible_p(typeof(*res), unsigned int)	\ | ||||
| 	? strtouint_h(cp, (void *) res)					\ | ||||
| 	: __builtin_types_compatible_p(typeof(*res), unsigned long)	\ | ||||
| 	? strtoul_h(cp, (void *) res)					\ | ||||
| 	: __builtin_types_compatible_p(typeof(*res), unsigned long long)\ | ||||
| 	? strtoull_h(cp, (void *) res) : -EINVAL) | ||||
| 
 | ||||
| #define strtoul_safe(cp, var)						\ | ||||
| ({									\ | ||||
| 	unsigned long _v;						\ | ||||
| 	int _r = kstrtoul(cp, 10, &_v);					\ | ||||
| 	if (!_r)							\ | ||||
| 		var = _v;						\ | ||||
| 	_r;								\ | ||||
| }) | ||||
| 
 | ||||
| #define strtoul_safe_clamp(cp, var, min, max)				\ | ||||
| ({									\ | ||||
| 	unsigned long _v;						\ | ||||
| 	int _r = kstrtoul(cp, 10, &_v);					\ | ||||
| 	if (!_r)							\ | ||||
| 		var = clamp_t(typeof(var), _v, min, max);		\ | ||||
| 	_r;								\ | ||||
| }) | ||||
| 
 | ||||
| #define snprint(buf, size, var)						\ | ||||
| 	snprintf(buf, size,						\ | ||||
| 		__builtin_types_compatible_p(typeof(var), int)		\ | ||||
| 		     ? "%i\n" :						\ | ||||
| 		__builtin_types_compatible_p(typeof(var), unsigned)	\ | ||||
| 		     ? "%u\n" :						\ | ||||
| 		__builtin_types_compatible_p(typeof(var), long)		\ | ||||
| 		     ? "%li\n" :					\ | ||||
| 		__builtin_types_compatible_p(typeof(var), unsigned long)\ | ||||
| 		     ? "%lu\n" :					\ | ||||
| 		__builtin_types_compatible_p(typeof(var), int64_t)	\ | ||||
| 		     ? "%lli\n" :					\ | ||||
| 		__builtin_types_compatible_p(typeof(var), uint64_t)	\ | ||||
| 		     ? "%llu\n" :					\ | ||||
| 		__builtin_types_compatible_p(typeof(var), const char *)	\ | ||||
| 		     ? "%s\n" : "%i\n", var) | ||||
| 
 | ||||
| ssize_t hprint(char *buf, int64_t v); | ||||
| 
 | ||||
| bool is_zero(const char *p, size_t n); | ||||
| int parse_uuid(const char *s, char *uuid); | ||||
| 
 | ||||
| ssize_t snprint_string_list(char *buf, size_t size, const char * const list[], | ||||
| 			    size_t selected); | ||||
| 
 | ||||
| ssize_t read_string_list(const char *buf, const char * const list[]); | ||||
| 
 | ||||
| struct time_stats { | ||||
| 	/*
 | ||||
| 	 * all fields are in nanoseconds, averages are ewmas stored left shifted | ||||
| 	 * by 8 | ||||
| 	 */ | ||||
| 	uint64_t	max_duration; | ||||
| 	uint64_t	average_duration; | ||||
| 	uint64_t	average_frequency; | ||||
| 	uint64_t	last; | ||||
| }; | ||||
| 
 | ||||
| void time_stats_update(struct time_stats *stats, uint64_t time); | ||||
| 
 | ||||
| #define NSEC_PER_ns			1L | ||||
| #define NSEC_PER_us			NSEC_PER_USEC | ||||
| #define NSEC_PER_ms			NSEC_PER_MSEC | ||||
| #define NSEC_PER_sec			NSEC_PER_SEC | ||||
| 
 | ||||
| #define __print_time_stat(stats, name, stat, units)			\ | ||||
| 	sysfs_print(name ## _ ## stat ## _ ## units,			\ | ||||
| 		    div_u64((stats)->stat >> 8, NSEC_PER_ ## units)) | ||||
| 
 | ||||
| #define sysfs_print_time_stats(stats, name,				\ | ||||
| 			       frequency_units,				\ | ||||
| 			       duration_units)				\ | ||||
| do {									\ | ||||
| 	__print_time_stat(stats, name,					\ | ||||
| 			  average_frequency,	frequency_units);	\ | ||||
| 	__print_time_stat(stats, name,					\ | ||||
| 			  average_duration,	duration_units);	\ | ||||
| 	__print_time_stat(stats, name,					\ | ||||
| 			  max_duration,		duration_units);	\ | ||||
| 									\ | ||||
| 	sysfs_print(name ## _last_ ## frequency_units, (stats)->last	\ | ||||
| 		    ? div_s64(local_clock() - (stats)->last,		\ | ||||
| 			      NSEC_PER_ ## frequency_units)		\ | ||||
| 		    : -1LL);						\ | ||||
| } while (0) | ||||
| 
 | ||||
| #define sysfs_time_stats_attribute(name,				\ | ||||
| 				   frequency_units,			\ | ||||
| 				   duration_units)			\ | ||||
| read_attribute(name ## _average_frequency_ ## frequency_units);		\ | ||||
| read_attribute(name ## _average_duration_ ## duration_units);		\ | ||||
| read_attribute(name ## _max_duration_ ## duration_units);		\ | ||||
| read_attribute(name ## _last_ ## frequency_units) | ||||
| 
 | ||||
| #define sysfs_time_stats_attribute_list(name,				\ | ||||
| 					frequency_units,		\ | ||||
| 					duration_units)			\ | ||||
| &sysfs_ ## name ## _average_frequency_ ## frequency_units,		\ | ||||
| &sysfs_ ## name ## _average_duration_ ## duration_units,		\ | ||||
| &sysfs_ ## name ## _max_duration_ ## duration_units,			\ | ||||
| &sysfs_ ## name ## _last_ ## frequency_units, | ||||
| 
 | ||||
/*
 * Fold @val into the exponentially weighted moving average @ewma:
 *
 *	ewma = (ewma * (weight - 1) + (val << factor)) / weight
 *
 * @ewma is an lvalue that is updated in place and stays left-shifted by
 * @factor; the macro evaluates to the unshifted average. @ewma, @weight and
 * @factor are evaluated more than once, so they must be free of side effects.
 * @factor is parenthesized so that any expression can be passed for it.
 */
#define ewma_add(ewma, val, weight, factor)				\
({									\
	(ewma) *= (weight) - 1;						\
	(ewma) += (val) << (factor);					\
	(ewma) /= (weight);						\
	(ewma) >> (factor);						\
})
| 
 | ||||
| struct ratelimit { | ||||
| 	uint64_t		next; | ||||
| 	unsigned		rate; | ||||
| }; | ||||
| 
 | ||||
| static inline void ratelimit_reset(struct ratelimit *d) | ||||
| { | ||||
| 	d->next = local_clock(); | ||||
| } | ||||
| 
 | ||||
| unsigned next_delay(struct ratelimit *d, uint64_t done); | ||||
| 
 | ||||
| #define __DIV_SAFE(n, d, zero)						\ | ||||
| ({									\ | ||||
| 	typeof(n) _n = (n);						\ | ||||
| 	typeof(d) _d = (d);						\ | ||||
| 	_d ? _n / _d : zero;						\ | ||||
| }) | ||||
| 
 | ||||
| #define DIV_SAFE(n, d)	__DIV_SAFE(n, d, 0) | ||||
| 
 | ||||
| #define container_of_or_null(ptr, type, member)				\ | ||||
| ({									\ | ||||
| 	typeof(ptr) _ptr = ptr;						\ | ||||
| 	_ptr ? container_of(_ptr, type, member) : NULL;			\ | ||||
| }) | ||||
| 
 | ||||
| #define RB_INSERT(root, new, member, cmp)				\ | ||||
| ({									\ | ||||
| 	__label__ dup;							\ | ||||
| 	struct rb_node **n = &(root)->rb_node, *parent = NULL;		\ | ||||
| 	typeof(new) this;						\ | ||||
| 	int res, ret = -1;						\ | ||||
| 									\ | ||||
| 	while (*n) {							\ | ||||
| 		parent = *n;						\ | ||||
| 		this = container_of(*n, typeof(*(new)), member);	\ | ||||
| 		res = cmp(new, this);					\ | ||||
| 		if (!res)						\ | ||||
| 			goto dup;					\ | ||||
| 		n = res < 0						\ | ||||
| 			? &(*n)->rb_left				\ | ||||
| 			: &(*n)->rb_right;				\ | ||||
| 	}								\ | ||||
| 									\ | ||||
| 	rb_link_node(&(new)->member, parent, n);			\ | ||||
| 	rb_insert_color(&(new)->member, root);				\ | ||||
| 	ret = 0;							\ | ||||
| dup:									\ | ||||
| 	ret;								\ | ||||
| }) | ||||
| 
 | ||||
| #define RB_SEARCH(root, search, member, cmp)				\ | ||||
| ({									\ | ||||
| 	struct rb_node *n = (root)->rb_node;				\ | ||||
| 	typeof(&(search)) this, ret = NULL;				\ | ||||
| 	int res;							\ | ||||
| 									\ | ||||
| 	while (n) {							\ | ||||
| 		this = container_of(n, typeof(search), member);		\ | ||||
| 		res = cmp(&(search), this);				\ | ||||
| 		if (!res) {						\ | ||||
| 			ret = this;					\ | ||||
| 			break;						\ | ||||
| 		}							\ | ||||
| 		n = res < 0						\ | ||||
| 			? n->rb_left					\ | ||||
| 			: n->rb_right;					\ | ||||
| 	}								\ | ||||
| 	ret;								\ | ||||
| }) | ||||
| 
 | ||||
| #define RB_GREATER(root, search, member, cmp)				\ | ||||
| ({									\ | ||||
| 	struct rb_node *n = (root)->rb_node;				\ | ||||
| 	typeof(&(search)) this, ret = NULL;				\ | ||||
| 	int res;							\ | ||||
| 									\ | ||||
| 	while (n) {							\ | ||||
| 		this = container_of(n, typeof(search), member);		\ | ||||
| 		res = cmp(&(search), this);				\ | ||||
| 		if (res < 0) {						\ | ||||
| 			ret = this;					\ | ||||
| 			n = n->rb_left;					\ | ||||
| 		} else							\ | ||||
| 			n = n->rb_right;				\ | ||||
| 	}								\ | ||||
| 	ret;								\ | ||||
| }) | ||||
| 
 | ||||
| #define RB_FIRST(root, type, member)					\ | ||||
| 	container_of_or_null(rb_first(root), type, member) | ||||
| 
 | ||||
| #define RB_LAST(root, type, member)					\ | ||||
| 	container_of_or_null(rb_last(root), type, member) | ||||
| 
 | ||||
| #define RB_NEXT(ptr, member)						\ | ||||
| 	container_of_or_null(rb_next(&(ptr)->member), typeof(*ptr), member) | ||||
| 
 | ||||
| #define RB_PREV(ptr, member)						\ | ||||
| 	container_of_or_null(rb_prev(&(ptr)->member), typeof(*ptr), member) | ||||
| 
 | ||||
/*
 * Approximates 2^x for a fixed point @x whose low @fract_bits bits are the
 * fractional part: the integer part selects the power of two and the
 * fraction interpolates linearly towards the next one.
 *
 * Unsigned constants are used for the mask and the power of two so that no
 * negative value is shifted and an integer part of 31 does not overflow a
 * signed int. @fract_bits must be smaller than the width of unsigned.
 */
static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
{
	unsigned fract = x & ~(~0U << fract_bits);

	x >>= fract_bits;
	x   = 1U << x;
	x  += (x * fract) >> fract_bits;

	return x;
}
| 
 | ||||
| #define bio_end(bio)	((bio)->bi_sector + bio_sectors(bio)) | ||||
| 
 | ||||
| void bio_map(struct bio *bio, void *base); | ||||
| 
 | ||||
| int bio_alloc_pages(struct bio *bio, gfp_t gfp); | ||||
| 
 | ||||
| static inline sector_t bdev_sectors(struct block_device *bdev) | ||||
| { | ||||
| 	return bdev->bd_inode->i_size >> 9; | ||||
| } | ||||
| 
 | ||||
| #define closure_bio_submit(bio, cl, dev)				\ | ||||
| do {									\ | ||||
| 	closure_get(cl);						\ | ||||
| 	bch_generic_make_request(bio, &(dev)->bio_split_hook);		\ | ||||
| } while (0) | ||||
| 
 | ||||
| uint64_t crc64_update(uint64_t, const void *, size_t); | ||||
| uint64_t crc64(const void *, size_t); | ||||
| 
 | ||||
| #endif /* _BCACHE_UTIL_H */ | ||||
							
								
								
									
										414
									
								
								drivers/md/bcache/writeback.c
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										414
									
								
								drivers/md/bcache/writeback.c
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,414 @@ | |||
| /*
 | ||||
|  * background writeback - scan btree for dirty data and write it to the backing | ||||
|  * device | ||||
|  * | ||||
|  * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> | ||||
|  * Copyright 2012 Google, Inc. | ||||
|  */ | ||||
| 
 | ||||
| #include "bcache.h" | ||||
| #include "btree.h" | ||||
| #include "debug.h" | ||||
| 
 | ||||
| static struct workqueue_struct *dirty_wq; | ||||
| 
 | ||||
| static void read_dirty(struct closure *); | ||||
| 
 | ||||
/*
 * Per-key state for one writeback operation: the dirty data is read from
 * the cache and then written to the backing device using the same bio.
 */
struct dirty_io {
	/* Drives the read -> write -> finish sequence for this key. */
	struct closure		cl;
	/* Cached device the dirty data belongs to. */
	struct cached_dev	*dc;
	/*
	 * Must stay the last member: read_dirty() allocates the bio_vec array
	 * directly behind this struct and dirty_init() points bi_io_vec at
	 * bio.bi_inline_vecs.
	 */
	struct bio		bio;
};
| 
 | ||||
| /* Rate limiting */ | ||||
| 
 | ||||
/*
 * Recompute dc->writeback_rate.rate from how far the amount of dirty data
 * is from its target, then re-arm the periodic rate update.
 *
 * The target is this device's share of the cache-wide dirty budget
 * (writeback_percent of all cache sectors), proportional to the backing
 * device's size relative to c->cached_dev_sectors.
 *
 * NOTE(review): c->cached_dev_sectors is used as a divisor without a zero
 * check; confirm it cannot be zero while a device is attached.
 */
static void __update_writeback_rate(struct cached_dev *dc)
{
	struct cache_set *c = dc->disk.c;
	uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size;
	uint64_t cache_dirty_target =
		div_u64(cache_sectors * dc->writeback_percent, 100);

	int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
				   c->cached_dev_sectors);

	/* PD controller */

	int change = 0;
	int64_t error;
	int64_t dirty = atomic_long_read(&dc->disk.sectors_dirty);
	/* Change in dirty sectors since the previous update. */
	int64_t derivative = dirty - dc->disk.sectors_dirty_last;

	dc->disk.sectors_dirty_last = dirty;

	/* Scale by the D term, but never beyond +/- the current dirty count. */
	derivative *= dc->writeback_rate_d_term;
	derivative = clamp(derivative, -dirty, dirty);

	/*
	 * Smooth the derivative; ewma_add() is defined elsewhere and
	 * presumably also updates sectors_dirty_derivative -- confirm.
	 */
	derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
			      dc->writeback_rate_d_smooth, 0);

	/* Avoid divide by zero */
	if (!target)
		goto out;

	/*
	 * Relative error as a fixed-point fraction with 8 fractional bits.
	 * NOTE(review): the shifted operand can be negative.
	 */
	error = div64_s64((dirty + derivative - target) << 8, target);

	/* Proportional step: rate * error (back out of fixed point) / P^-1. */
	change = div_s64((dc->writeback_rate.rate * error) >> 8,
			 dc->writeback_rate_p_term_inverse);

	/* Don't increase writeback rate if the device isn't keeping up */
	if (change > 0 &&
	    time_after64(local_clock(),
			 dc->writeback_rate.next + 10 * NSEC_PER_MSEC))
		change = 0;

	dc->writeback_rate.rate =
		clamp_t(int64_t, dc->writeback_rate.rate + change,
			1, NSEC_PER_MSEC);
out:
	/* Record the controller terms (change stays 0 when target was 0). */
	dc->writeback_rate_derivative = derivative;
	dc->writeback_rate_change = change;
	dc->writeback_rate_target = target;

	schedule_delayed_work(&dc->writeback_rate_update,
			      dc->writeback_rate_update_seconds * HZ);
}
| 
 | ||||
| static void update_writeback_rate(struct work_struct *work) | ||||
| { | ||||
| 	struct cached_dev *dc = container_of(to_delayed_work(work), | ||||
| 					     struct cached_dev, | ||||
| 					     writeback_rate_update); | ||||
| 
 | ||||
| 	down_read(&dc->writeback_lock); | ||||
| 
 | ||||
| 	if (atomic_read(&dc->has_dirty) && | ||||
| 	    dc->writeback_percent) | ||||
| 		__update_writeback_rate(dc); | ||||
| 
 | ||||
| 	up_read(&dc->writeback_lock); | ||||
| } | ||||
| 
 | ||||
| static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) | ||||
| { | ||||
| 	if (atomic_read(&dc->disk.detaching) || | ||||
| 	    !dc->writeback_percent) | ||||
| 		return 0; | ||||
| 
 | ||||
| 	return next_delay(&dc->writeback_rate, sectors * 10000000ULL); | ||||
| } | ||||
| 
 | ||||
| /* Background writeback */ | ||||
| 
 | ||||
/*
 * Keybuf predicate for the writeback scan (installed by
 * bch_writeback_init_cached_dev()): selects keys with the dirty bit set.
 * @buf is unused.
 */
static bool dirty_pred(struct keybuf *buf, struct bkey *k)
{
	return KEY_DIRTY(k);
}
| 
 | ||||
/*
 * (Re)initialise the bio embedded in the dirty_io attached to @w so that it
 * covers the whole extent described by w->key. Called once before the read
 * from the cache and again before the write to the backing device; the
 * caller fills in the direction, sector, bdev and end_io afterwards.
 */
static void dirty_init(struct keybuf_key *w)
{
	struct dirty_io *io = w->private;
	struct bio *bio = &io->bio;

	bio_init(bio);
	/* With no writeback_percent set, run writeback at idle I/O priority. */
	if (!io->dc->writeback_percent)
		bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));

	/* KEY_SIZE() is in sectors; bi_size is in bytes. */
	bio->bi_size		= KEY_SIZE(&w->key) << 9;
	bio->bi_max_vecs	= DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS);
	/* The endio handlers recover the keybuf_key from bi_private. */
	bio->bi_private		= w;
	/* The bvecs live in the memory allocated right behind struct dirty_io. */
	bio->bi_io_vec		= bio->bi_inline_vecs;
	/* NOTE(review): bio_map() with a NULL base is opaque here; confirm. */
	bio_map(bio, NULL);
}
| 
 | ||||
/*
 * Refill dc->writeback_keys with dirty keys belonging to this device and
 * hand over to read_dirty().
 *
 * The scan resumes from buf->last_scanned and wraps around to the start of
 * the device's key space once the end has been reached.
 *
 * NOTE(review): closure_return() and continue_at() are macros defined
 * elsewhere; the control flow below relies on them returning from this
 * function (otherwise the early-exit paths would fall through) -- confirm.
 */
static void refill_dirty(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev,
					     writeback.cl);
	struct keybuf *buf = &dc->writeback_keys;
	bool searched_from_start = false;
	/* Upper bound of the scan: the maximum key, with this device's inode. */
	struct bkey end = MAX_KEY;
	SET_KEY_INODE(&end, dc->disk.id);

	/* Writeback is paused and we are not detaching: nothing to do. */
	if (!atomic_read(&dc->disk.detaching) &&
	    !dc->writeback_running)
		closure_return(cl);

	down_write(&dc->writeback_lock);

	/* No dirty data: mark the backing device clean in its superblock. */
	if (!atomic_read(&dc->has_dirty)) {
		SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
		bch_write_bdev_super(dc, NULL);

		up_write(&dc->writeback_lock);
		closure_return(cl);
	}

	/* The previous scan reached the end: restart from the first key. */
	if (bkey_cmp(&buf->last_scanned, &end) >= 0) {
		buf->last_scanned = KEY(dc->disk.id, 0, 0);
		searched_from_start = true;
	}

	bch_refill_keybuf(dc->disk.c, buf, &end);

	if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) {
		/* Searched the entire btree - delay a while */

		/*
		 * A full pass found no dirty keys: clear has_dirty and drop a
		 * reference on the device (presumably the one taken via
		 * dc->count in bch_writeback_add() -- confirm).
		 */
		if (RB_EMPTY_ROOT(&buf->keys)) {
			atomic_set(&dc->has_dirty, 0);
			cached_dev_put(dc);
		}

		if (!atomic_read(&dc->disk.detaching))
			closure_delay(&dc->writeback, dc->writeback_delay * HZ);
	}

	up_write(&dc->writeback_lock);

	ratelimit_reset(&dc->writeback_rate);

	/* Punt to workqueue only so we don't recurse and blow the stack */
	continue_at(cl, read_dirty, dirty_wq);
}
| 
 | ||||
| void bch_writeback_queue(struct cached_dev *dc) | ||||
| { | ||||
| 	if (closure_trylock(&dc->writeback.cl, &dc->disk.cl)) { | ||||
| 		if (!atomic_read(&dc->disk.detaching)) | ||||
| 			closure_delay(&dc->writeback, dc->writeback_delay * HZ); | ||||
| 
 | ||||
| 		continue_at(&dc->writeback.cl, refill_dirty, dirty_wq); | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| void bch_writeback_add(struct cached_dev *dc, unsigned sectors) | ||||
| { | ||||
| 	atomic_long_add(sectors, &dc->disk.sectors_dirty); | ||||
| 
 | ||||
| 	if (!atomic_read(&dc->has_dirty) && | ||||
| 	    !atomic_xchg(&dc->has_dirty, 1)) { | ||||
| 		atomic_inc(&dc->count); | ||||
| 
 | ||||
| 		if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { | ||||
| 			SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); | ||||
| 			/* XXX: should do this synchronously */ | ||||
| 			bch_write_bdev_super(dc, NULL); | ||||
| 		} | ||||
| 
 | ||||
| 		bch_writeback_queue(dc); | ||||
| 
 | ||||
| 		if (dc->writeback_percent) | ||||
| 			schedule_delayed_work(&dc->writeback_rate_update, | ||||
| 				      dc->writeback_rate_update_seconds * HZ); | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| /* Background writeback - IO loop */ | ||||
| 
 | ||||
| static void dirty_io_destructor(struct closure *cl) | ||||
| { | ||||
| 	struct dirty_io *io = container_of(cl, struct dirty_io, cl); | ||||
| 	kfree(io); | ||||
| } | ||||
| 
 | ||||
/*
 * Final stage of writing back one key: release the data pages, clear the
 * key's dirty bit in the btree if the I/O succeeded, and retire the key
 * from the writeback keybuf.
 */
static void write_dirty_finish(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	struct keybuf_key *w = io->bio.bi_private;
	struct cached_dev *dc = io->dc;
	struct bio_vec *bv = bio_iovec_idx(&io->bio, io->bio.bi_vcnt);

	/* Walk the bvecs backwards from one past the last, freeing each page. */
	while (bv-- != io->bio.bi_io_vec)
		__free_page(bv->bv_page);

	/*
	 * This is kind of a dumb way of signalling errors: dirty_endio()
	 * clears the dirty bit on the in-memory key when an I/O fails, so a
	 * key that is still dirty here was read and written successfully.
	 */
	if (KEY_DIRTY(&w->key)) {
		unsigned i;
		struct btree_op op;
		bch_btree_op_init_stack(&op);

		/* Replace the key as we saw it with a copy marked clean. */
		op.type = BTREE_REPLACE;
		bkey_copy(&op.replace, &w->key);

		SET_KEY_DIRTY(&w->key, false);
		bch_keylist_add(&op.keys, &w->key);

		/*
		 * Pin every bucket the key points into for the insert.
		 * NOTE(review): where the pins are dropped is not visible
		 * here -- confirm.
		 */
		for (i = 0; i < KEY_PTRS(&w->key); i++)
			atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);

		pr_debug("clearing %s", pkey(&w->key));
		bch_btree_insert(&op, dc->disk.c);
		closure_sync(&op.cl);

		/* insert_collision set counts as failed, otherwise as done. */
		atomic_long_inc(op.insert_collision
				? &dc->disk.c->writeback_keys_failed
				: &dc->disk.c->writeback_keys_done);
	}

	bch_keybuf_del(&dc->writeback_keys, w);
	atomic_dec_bug(&dc->in_flight);

	/* Let read_dirty() continue if it was waiting on the in-flight limit. */
	closure_wake_up(&dc->writeback_wait);

	closure_return_with_destructor(cl, dirty_io_destructor);
}
| 
 | ||||
/*
 * Completion handler for the writeback write (also called by
 * read_dirty_endio() for the read).
 *
 * On error the dirty bit of the in-memory key is cleared; that is the
 * signal write_dirty_finish() uses to skip the btree update, so the key in
 * the btree itself stays dirty. Then drops the closure reference taken when
 * the bio was submitted.
 */
static void dirty_endio(struct bio *bio, int error)
{
	struct keybuf_key *w = bio->bi_private;
	struct dirty_io *io = w->private;

	if (error)
		SET_KEY_DIRTY(&w->key, false);

	closure_put(&io->cl);
}
| 
 | ||||
/*
 * Second stage of writing back one key: reuse the bio that read the data
 * from the cache to write it to the backing device at the key's start
 * sector, then continue with write_dirty_finish() on dirty_wq.
 *
 * NOTE(review): this stage runs even if the preceding read failed (the
 * error is only recorded by clearing the key's dirty bit) -- confirm that
 * is intended.
 */
static void write_dirty(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	struct keybuf_key *w = io->bio.bi_private;

	/* Reset the bio; dirty_init() reinitialises it for the same extent. */
	dirty_init(w);
	io->bio.bi_rw		= WRITE;
	io->bio.bi_sector	= KEY_START(&w->key);
	io->bio.bi_bdev		= io->dc->bdev;
	io->bio.bi_end_io	= dirty_endio;

	trace_bcache_write_dirty(&io->bio);
	closure_bio_submit(&io->bio, cl, &io->dc->disk);

	continue_at(cl, write_dirty_finish, dirty_wq);
}
| 
 | ||||
/*
 * Completion handler for the read of dirty data from the cache: reports
 * @error to the I/O error accounting of the cache device behind the key's
 * first pointer, then does the common completion work in dirty_endio().
 */
static void read_dirty_endio(struct bio *bio, int error)
{
	struct keybuf_key *w = bio->bi_private;
	struct dirty_io *io = w->private;

	bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
			    error, "reading dirty data from cache");

	dirty_endio(bio, error);
}
| 
 | ||||
/*
 * First stage of writing back one key: submit the read bio prepared by
 * read_dirty(), then continue with write_dirty() on dirty_wq.
 */
static void read_dirty_submit(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);

	trace_bcache_read_dirty(&io->bio);
	closure_bio_submit(&io->bio, cl, &io->dc->disk);

	continue_at(cl, write_dirty, dirty_wq);
}
| 
 | ||||
/*
 * Main loop of background writeback: take dirty keys from
 * dc->writeback_keys and start one dirty_io per key, throttled by the
 * writeback rate and by a limit on the number of I/Os in flight. When the
 * keybuf runs dry, or an allocation fails, control passes to refill_dirty().
 *
 * NOTE(review): continue_at() is a macro defined elsewhere; the code relies
 * on it returning from this function -- confirm.
 */
static void read_dirty(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev,
					     writeback.cl);
	unsigned delay = writeback_delay(dc, 0);
	struct keybuf_key *w;
	struct dirty_io *io;

	/*
	 * XXX: if we error, background writeback just spins. Should use some
	 * mempools.
	 */

	while (1) {
		w = bch_keybuf_next(&dc->writeback_keys);
		if (!w)
			break;

		BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));

		/*
		 * Honour the rate limit, except that a key starting exactly
		 * where the previous one ended (last_read) is issued right
		 * away as long as the pending delay is at most 50ms.
		 */
		if (delay > 0 &&
		    (KEY_START(&w->key) != dc->last_read ||
		     jiffies_to_msecs(delay) > 50)) {
			/*
			 * Presumably hands the key back so that
			 * bch_keybuf_next() returns it again -- confirm.
			 */
			w->private = NULL;

			closure_delay(&dc->writeback, delay);
			continue_at(cl, read_dirty, dirty_wq);
		}

		/* Remember where this key ends for the contiguity check above. */
		dc->last_read	= KEY_OFFSET(&w->key);

		/* The bio_vec array is allocated directly behind the dirty_io. */
		io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
			     * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
			     GFP_KERNEL);
		if (!io)
			goto err;

		w->private	= io;
		io->dc		= dc;

		/* Read the extent from the cache device of the first pointer. */
		dirty_init(w);
		io->bio.bi_sector	= PTR_OFFSET(&w->key, 0);
		io->bio.bi_bdev		= PTR_CACHE(dc->disk.c,
						    &w->key, 0)->bdev;
		io->bio.bi_rw		= READ;
		io->bio.bi_end_io	= read_dirty_endio;

		if (bio_alloc_pages(&io->bio, GFP_KERNEL))
			goto err_free;

		pr_debug("%s", pkey(&w->key));

		closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl);

		delay = writeback_delay(dc, KEY_SIZE(&w->key));

		atomic_inc(&dc->in_flight);

		/*
		 * Cap the number of I/Os in flight at 64. NOTE(review):
		 * presumably a false return means we were put on the wait
		 * list and must come back later -- confirm.
		 */
		if (!closure_wait_event(&dc->writeback_wait, cl,
					atomic_read(&dc->in_flight) < 64))
			continue_at(cl, read_dirty, dirty_wq);
	}

	/* Error labels, only reachable via goto; skipped on normal loop exit. */
	if (0) {
err_free:
		kfree(w->private);
err:
		bch_keybuf_del(&dc->writeback_keys, w);
	}

	refill_dirty(cl);
}
| 
 | ||||
/*
 * Initialise the writeback state of @dc: the writeback closure and lock,
 * the keybuf that collects dirty keys (filtered by dirty_pred()), the
 * default tunables, and the periodic writeback rate update, which is armed
 * right away.
 */
void bch_writeback_init_cached_dev(struct cached_dev *dc)
{
	closure_init_unlocked(&dc->writeback);
	init_rwsem(&dc->writeback_lock);

	bch_keybuf_init(&dc->writeback_keys, dirty_pred);

	dc->writeback_metadata		= true;
	dc->writeback_running		= true;
	/* Percentage of the cache used as the dirty-data target. */
	dc->writeback_percent		= 10;
	/* Seconds; multiplied by HZ where it is used as a closure delay. */
	dc->writeback_delay		= 30;
	dc->writeback_rate.rate		= 1024;

	/* Controller parameters used by __update_writeback_rate(). */
	dc->writeback_rate_update_seconds = 30;
	dc->writeback_rate_d_term	= 16;
	dc->writeback_rate_p_term_inverse = 64;
	dc->writeback_rate_d_smooth	= 8;

	INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
	schedule_delayed_work(&dc->writeback_rate_update,
			      dc->writeback_rate_update_seconds * HZ);
}
| 
 | ||||
| void bch_writeback_exit(void) | ||||
| { | ||||
| 	if (dirty_wq) | ||||
| 		destroy_workqueue(dirty_wq); | ||||
| } | ||||
| 
 | ||||
| int __init bch_writeback_init(void) | ||||
| { | ||||
| 	dirty_wq = create_singlethread_workqueue("bcache_writeback"); | ||||
| 	if (!dirty_wq) | ||||
| 		return -ENOMEM; | ||||
| 
 | ||||
| 	return 0; | ||||
| } | ||||
|  | @ -78,3 +78,9 @@ SUBSYS(hugetlb) | |||
| #endif | ||||
| 
 | ||||
| /* */ | ||||
| 
 | ||||
| #ifdef CONFIG_CGROUP_BCACHE | ||||
| SUBSYS(bcache) | ||||
| #endif | ||||
| 
 | ||||
| /* */ | ||||
|  |  | |||
|  | @ -1576,6 +1576,10 @@ struct task_struct { | |||
| #ifdef CONFIG_UPROBES | ||||
| 	struct uprobe_task *utask; | ||||
| #endif | ||||
| #if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE) | ||||
| 	unsigned int	sequential_io; | ||||
| 	unsigned int	sequential_io_avg; | ||||
| #endif | ||||
| }; | ||||
| 
 | ||||
| /* Future-safe accessor for struct task_struct's cpus_allowed. */ | ||||
|  |  | |||
							
								
								
									
										271
									
								
								include/trace/events/bcache.h
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										271
									
								
								include/trace/events/bcache.h
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,271 @@ | |||
| #undef TRACE_SYSTEM | ||||
| #define TRACE_SYSTEM bcache | ||||
| 
 | ||||
| #if !defined(_TRACE_BCACHE_H) || defined(TRACE_HEADER_MULTI_READ) | ||||
| #define _TRACE_BCACHE_H | ||||
| 
 | ||||
| #include <linux/tracepoint.h> | ||||
| 
 | ||||
| struct search; | ||||
| 
 | ||||
| DECLARE_EVENT_CLASS(bcache_request, | ||||
| 
 | ||||
| 	TP_PROTO(struct search *s, struct bio *bio), | ||||
| 
 | ||||
| 	TP_ARGS(s, bio), | ||||
| 
 | ||||
| 	TP_STRUCT__entry( | ||||
| 		__field(dev_t,		dev			) | ||||
| 		__field(unsigned int,	orig_major		) | ||||
| 		__field(unsigned int,	orig_minor		) | ||||
| 		__field(sector_t,	sector			) | ||||
| 		__field(dev_t,		orig_sector		) | ||||
| 		__field(unsigned int,	nr_sector		) | ||||
| 		__array(char,		rwbs,	6		) | ||||
| 		__array(char,		comm,	TASK_COMM_LEN	) | ||||
| 	), | ||||
| 
 | ||||
| 	TP_fast_assign( | ||||
| 		__entry->dev		= bio->bi_bdev->bd_dev; | ||||
| 		__entry->orig_major	= s->d->disk->major; | ||||
| 		__entry->orig_minor	= s->d->disk->first_minor; | ||||
| 		__entry->sector		= bio->bi_sector; | ||||
| 		__entry->orig_sector	= bio->bi_sector - 16; | ||||
| 		__entry->nr_sector	= bio->bi_size >> 9; | ||||
| 		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); | ||||
| 		memcpy(__entry->comm, current->comm, TASK_COMM_LEN); | ||||
| 	), | ||||
| 
 | ||||
| 	TP_printk("%d,%d %s %llu + %u [%s] (from %d,%d @ %llu)", | ||||
| 		  MAJOR(__entry->dev), MINOR(__entry->dev), | ||||
| 		  __entry->rwbs, | ||||
| 		  (unsigned long long)__entry->sector, | ||||
| 		  __entry->nr_sector, __entry->comm, | ||||
| 		  __entry->orig_major, __entry->orig_minor, | ||||
| 		  (unsigned long long)__entry->orig_sector) | ||||
| ); | ||||
| 
 | ||||
| DEFINE_EVENT(bcache_request, bcache_request_start, | ||||
| 
 | ||||
| 	TP_PROTO(struct search *s, struct bio *bio), | ||||
| 
 | ||||
| 	TP_ARGS(s, bio) | ||||
| ); | ||||
| 
 | ||||
| DEFINE_EVENT(bcache_request, bcache_request_end, | ||||
| 
 | ||||
| 	TP_PROTO(struct search *s, struct bio *bio), | ||||
| 
 | ||||
| 	TP_ARGS(s, bio) | ||||
| ); | ||||
| 
 | ||||
/*
 * Event class for events that describe a single bio: records the bio's
 * device, start sector, length in 512-byte sectors (bi_size >> 9), the
 * rwbs flag string produced by blk_fill_rwbs() and the name of the current
 * task.
 */
DECLARE_EVENT_CLASS(bcache_bio,

	TP_PROTO(struct bio *bio),

	TP_ARGS(bio),

	TP_STRUCT__entry(
		__field(dev_t,		dev			)
		__field(sector_t,	sector			)
		__field(unsigned int,	nr_sector		)
		__array(char,		rwbs,	6		)
		__array(char,		comm,	TASK_COMM_LEN	)
	),

	TP_fast_assign(
		__entry->dev		= bio->bi_bdev->bd_dev;
		__entry->sector		= bio->bi_sector;
		__entry->nr_sector	= bio->bi_size >> 9;
		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
	),

	TP_printk("%d,%d  %s %llu + %u [%s]",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->rwbs,
		  (unsigned long long)__entry->sector,
		  __entry->nr_sector, __entry->comm)
);
| 
 | ||||
| 
 | ||||
/*
 * Instances of the bcache_bio class. Each one generates a
 * trace_<name>(struct bio *) hook; all share the class's record layout and
 * output format.
 */

/* Request-path events. */
DEFINE_EVENT(bcache_bio, bcache_passthrough,

	TP_PROTO(struct bio *bio),

	TP_ARGS(bio)
);

DEFINE_EVENT(bcache_bio, bcache_cache_hit,

	TP_PROTO(struct bio *bio),

	TP_ARGS(bio)
);

DEFINE_EVENT(bcache_bio, bcache_cache_miss,

	TP_PROTO(struct bio *bio),

	TP_ARGS(bio)
);

DEFINE_EVENT(bcache_bio, bcache_read_retry,

	TP_PROTO(struct bio *bio),

	TP_ARGS(bio)
);

DEFINE_EVENT(bcache_bio, bcache_writethrough,

	TP_PROTO(struct bio *bio),

	TP_ARGS(bio)
);

DEFINE_EVENT(bcache_bio, bcache_writeback,

	TP_PROTO(struct bio *bio),

	TP_ARGS(bio)
);

DEFINE_EVENT(bcache_bio, bcache_write_skip,

	TP_PROTO(struct bio *bio),

	TP_ARGS(bio)
);

/* Btree node I/O. */
DEFINE_EVENT(bcache_bio, bcache_btree_read,

	TP_PROTO(struct bio *bio),

	TP_ARGS(bio)
);

DEFINE_EVENT(bcache_bio, bcache_btree_write,

	TP_PROTO(struct bio *bio),

	TP_ARGS(bio)
);

/* Background writeback: emitted by write_dirty() and read_dirty_submit(). */
DEFINE_EVENT(bcache_bio, bcache_write_dirty,

	TP_PROTO(struct bio *bio),

	TP_ARGS(bio)
);

DEFINE_EVENT(bcache_bio, bcache_read_dirty,

	TP_PROTO(struct bio *bio),

	TP_ARGS(bio)
);

/* Moving data (callers not visible here). */
DEFINE_EVENT(bcache_bio, bcache_write_moving,

	TP_PROTO(struct bio *bio),

	TP_ARGS(bio)
);

DEFINE_EVENT(bcache_bio, bcache_read_moving,

	TP_PROTO(struct bio *bio),

	TP_ARGS(bio)
);

/* Journal writes. */
DEFINE_EVENT(bcache_bio, bcache_journal_write,

	TP_PROTO(struct bio *bio),

	TP_ARGS(bio)
);
| 
 | ||||
/*
 * Event class for a bio issued to a cache device on behalf of another
 * device: besides the bio's own device, sector and length it records the
 * originating block device (@orig_bdev) and sector (@orig_sector), which
 * are printed as "(from MAJOR,MINOR SECTOR)".
 */
DECLARE_EVENT_CLASS(bcache_cache_bio,

	TP_PROTO(struct bio *bio,
		 sector_t orig_sector,
		 struct block_device* orig_bdev),

	TP_ARGS(bio, orig_sector, orig_bdev),

	TP_STRUCT__entry(
		__field(dev_t,		dev			)
		__field(dev_t,		orig_dev		)
		__field(sector_t,	sector			)
		__field(sector_t,	orig_sector		)
		__field(unsigned int,	nr_sector		)
		__array(char,		rwbs,	6		)
		__array(char,		comm,	TASK_COMM_LEN	)
	),

	TP_fast_assign(
		__entry->dev		= bio->bi_bdev->bd_dev;
		__entry->orig_dev	= orig_bdev->bd_dev;
		__entry->sector		= bio->bi_sector;
		__entry->orig_sector	= orig_sector;
		__entry->nr_sector	= bio->bi_size >> 9;
		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
	),

	TP_printk("%d,%d  %s %llu + %u [%s] (from %d,%d %llu)",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->rwbs,
		  (unsigned long long)__entry->sector,
		  __entry->nr_sector, __entry->comm,
		  MAJOR(__entry->orig_dev), MINOR(__entry->orig_dev),
		  (unsigned long long)__entry->orig_sector)
);

/* The only instance of the bcache_cache_bio class. */
DEFINE_EVENT(bcache_cache_bio, bcache_cache_insert,

	TP_PROTO(struct bio *bio,
		 sector_t orig_sector,
		 struct block_device *orig_bdev),

	TP_ARGS(bio, orig_sector, orig_bdev)
);
| 
 | ||||
| DECLARE_EVENT_CLASS(bcache_gc, | ||||
| 
 | ||||
| 	TP_PROTO(uint8_t *uuid), | ||||
| 
 | ||||
| 	TP_ARGS(uuid), | ||||
| 
 | ||||
| 	TP_STRUCT__entry( | ||||
| 		__field(uint8_t *,	uuid) | ||||
| 	), | ||||
| 
 | ||||
| 	TP_fast_assign( | ||||
| 		__entry->uuid		= uuid; | ||||
| 	), | ||||
| 
 | ||||
| 	TP_printk("%pU", __entry->uuid) | ||||
| ); | ||||
| 
 | ||||
| 
 | ||||
| DEFINE_EVENT(bcache_gc, bcache_gc_start, | ||||
| 
 | ||||
| 	     TP_PROTO(uint8_t *uuid), | ||||
| 
 | ||||
| 	     TP_ARGS(uuid) | ||||
| ); | ||||
| 
 | ||||
| DEFINE_EVENT(bcache_gc, bcache_gc_end, | ||||
| 
 | ||||
| 	     TP_PROTO(uint8_t *uuid), | ||||
| 
 | ||||
| 	     TP_ARGS(uuid) | ||||
| ); | ||||
| 
 | ||||
| #endif /* _TRACE_BCACHE_H */ | ||||
| 
 | ||||
| /* This part must be outside protection */ | ||||
| #include <trace/define_trace.h> | ||||
|  | @ -1303,6 +1303,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 	p->memcg_batch.do_batch = 0; | ||||
| 	p->memcg_batch.memcg = NULL; | ||||
| #endif | ||||
#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
	/*
	 * Same condition as the one guarding these fields in struct
	 * task_struct, so they are also reset in a new task when bcache is
	 * built as a module.
	 */
	p->sequential_io	= 0;
	p->sequential_io_avg	= 0;
#endif
| 
 | ||||
| 	/* Perform scheduler related setup. Assign this task to a CPU. */ | ||||
| 	sched_fork(p); | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Kent Overstreet
				Kent Overstreet