This is a bad one. I wonder whether we were so far protected by
no_free_segments(sb) usually being smaller than LOGFS_NO_AREAS.

Found by Dan Carpenter <dan.carpenter@oracle.com> using smatch.

Signed-off-by: Joern Engel <joern@logfs.org>
Signed-off-by: Prasad Joshi <prasadjoshi.linux@gmail.com>
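For context, the clamp in question sits in get_candidate() below.  A minimal
illustrative sketch of why the upper bound matters, assuming s_low_list[] is
declared with LOGFS_NO_AREAS entries (so valid indices are
0 .. LOGFS_NO_AREAS - 1):

	/* Illustrative sketch only, not part of this patch. */
	max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS - 1); /* clamp to last valid index */
	for (i = max_dist; i >= 0; i--)
		this = first_in_list(&super->s_low_list[i]);
	/* Without the "- 1", i could reach LOGFS_NO_AREAS once enough
	 * segments are free, indexing one past the end of s_low_list[].
	 */

As long as no_free_segments(sb) stays below LOGFS_NO_AREAS, the min() already
keeps i in range, which is presumably why the problem went unnoticed.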
/*
 * fs/logfs/gc.c	- garbage collection code
 *
 * As should be obvious for Linux kernel code, license is GPLv2
 *
 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
 */
#include "logfs.h"
#include <linux/sched.h>
#include <linux/slab.h>

/*
 * Wear leveling needs to kick in when the difference between low erase
 * counts and high erase counts gets too big.  A good value for "too big"
 * may be somewhat below 10% of maximum erase count for the device.
 * Why not 397, to pick a nice round number with no specific meaning? :)
 *
 * WL_RATELIMIT is the minimum time between two wear level events.  A huge
 * number of segments may fulfil the requirements for wear leveling at the
 * same time.  If that happens we don't want to cause a latency from hell,
 * but just gently pick one segment every so often and minimize overhead.
 */
#define WL_DELTA 397
#define WL_RATELIMIT 100
#define MAX_OBJ_ALIASES	2600
#define SCAN_RATIO 512	/* number of scanned segments per gc'd segment */
#define LIST_SIZE 64	/* base size of candidate lists */
#define SCAN_ROUNDS 128	/* maximum number of complete medium scans */
#define SCAN_ROUNDS_HIGH 4 /* maximum number of higher-level scans */

static int no_free_segments(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);

	return super->s_free_list.count;
}

/* journal has distance -1, top-most ifile layer distance 0 */
static u8 root_distance(struct super_block *sb, gc_level_t __gc_level)
{
	struct logfs_super *super = logfs_super(sb);
	u8 gc_level = (__force u8)__gc_level;

	switch (gc_level) {
	case 0: /* fall through */
	case 1: /* fall through */
	case 2: /* fall through */
	case 3:
		/* file data or indirect blocks */
		return super->s_ifile_levels + super->s_iblock_levels - gc_level;
	case 6: /* fall through */
	case 7: /* fall through */
	case 8: /* fall through */
	case 9:
		/* inode file data or indirect blocks */
		return super->s_ifile_levels - (gc_level - 6);
	default:
		printk(KERN_ERR"LOGFS: segment of unknown level %x found\n",
				gc_level);
		WARN_ON(1);
		return super->s_ifile_levels + super->s_iblock_levels;
	}
}

static int segment_is_reserved(struct super_block *sb, u32 segno)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_area *area;
	void *reserved;
	int i;

	/* Some segments are reserved.  Just pretend they were all valid */
	reserved = btree_lookup32(&super->s_reserved_segments, segno);
	if (reserved)
		return 1;

	/* Currently open segments */
	for_each_area(i) {
		area = super->s_area[i];
		if (area->a_is_open && area->a_segno == segno)
			return 1;
	}

	return 0;
}

static void logfs_mark_segment_bad(struct super_block *sb, u32 segno)
{
	BUG();
}

/*
 * Returns the bytes consumed by valid objects in this segment.  Object headers
 * are counted, the segment header is not.
 */
static u32 logfs_valid_bytes(struct super_block *sb, u32 segno, u32 *ec,
		gc_level_t *gc_level)
{
	struct logfs_segment_entry se;
	u32 ec_level;

	logfs_get_segment_entry(sb, segno, &se);
	if (se.ec_level == cpu_to_be32(BADSEG) ||
			se.valid == cpu_to_be32(RESERVED))
		return RESERVED;

	ec_level = be32_to_cpu(se.ec_level);
	*ec = ec_level >> 4;
	*gc_level = GC_LEVEL(ec_level & 0xf);
	return be32_to_cpu(se.valid);
}

static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino,
		u64 bix, gc_level_t gc_level)
{
	struct inode *inode;
	int err, cookie;

	inode = logfs_safe_iget(sb, ino, &cookie);
	err = logfs_rewrite_block(inode, bix, ofs, gc_level, 0);
	BUG_ON(err);
	logfs_safe_iput(inode, cookie);
}

static u32 logfs_gc_segment(struct super_block *sb, u32 segno)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_segment_header sh;
	struct logfs_object_header oh;
	u64 ofs, ino, bix;
	u32 seg_ofs, logical_segno, cleaned = 0;
	int err, len, valid;
	gc_level_t gc_level;

	LOGFS_BUG_ON(segment_is_reserved(sb, segno), sb);

	btree_insert32(&super->s_reserved_segments, segno, (void *)1, GFP_NOFS);
	err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh);
	BUG_ON(err);
	gc_level = GC_LEVEL(sh.level);
	logical_segno = be32_to_cpu(sh.segno);
	if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) {
		logfs_mark_segment_bad(sb, segno);
		cleaned = -1;
		goto out;
	}

	for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE;
			seg_ofs + sizeof(oh) < super->s_segsize; ) {
		ofs = dev_ofs(sb, logical_segno, seg_ofs);
		err = wbuf_read(sb, dev_ofs(sb, segno, seg_ofs), sizeof(oh),
				&oh);
		BUG_ON(err);

		if (!memchr_inv(&oh, 0xff, sizeof(oh)))
			break;

		if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) {
			logfs_mark_segment_bad(sb, segno);
			cleaned = super->s_segsize - 1;
			goto out;
		}

		ino = be64_to_cpu(oh.ino);
		bix = be64_to_cpu(oh.bix);
		len = sizeof(oh) + be16_to_cpu(oh.len);
		valid = logfs_is_valid_block(sb, ofs, ino, bix, gc_level);
		if (valid == 1) {
			logfs_cleanse_block(sb, ofs, ino, bix, gc_level);
			cleaned += len;
		} else if (valid == 2) {
			/* Will be invalid upon journal commit */
			cleaned += len;
		}
		seg_ofs += len;
	}
out:
	btree_remove32(&super->s_reserved_segments, segno);
	return cleaned;
}

static struct gc_candidate *add_list(struct gc_candidate *cand,
		struct candidate_list *list)
{
	struct rb_node **p = &list->rb_tree.rb_node;
	struct rb_node *parent = NULL;
	struct gc_candidate *cur;
	int comp;

	cand->list = list;
	while (*p) {
		parent = *p;
		cur = rb_entry(parent, struct gc_candidate, rb_node);

		if (list->sort_by_ec)
			comp = cand->erase_count < cur->erase_count;
		else
			comp = cand->valid < cur->valid;

		if (comp)
			p = &parent->rb_left;
		else
			p = &parent->rb_right;
	}
	rb_link_node(&cand->rb_node, parent, p);
	rb_insert_color(&cand->rb_node, &list->rb_tree);

	if (list->count <= list->maxcount) {
		list->count++;
		return NULL;
	}
	cand = rb_entry(rb_last(&list->rb_tree), struct gc_candidate, rb_node);
	rb_erase(&cand->rb_node, &list->rb_tree);
	cand->list = NULL;
	return cand;
}

static void remove_from_list(struct gc_candidate *cand)
{
	struct candidate_list *list = cand->list;

	rb_erase(&cand->rb_node, &list->rb_tree);
	list->count--;
}

static void free_candidate(struct super_block *sb, struct gc_candidate *cand)
{
	struct logfs_super *super = logfs_super(sb);

	btree_remove32(&super->s_cand_tree, cand->segno);
	kfree(cand);
}

u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec)
{
	struct gc_candidate *cand;
	u32 segno;

	BUG_ON(list->count == 0);

	cand = rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
	remove_from_list(cand);
	segno = cand->segno;
	if (ec)
		*ec = cand->erase_count;
	free_candidate(sb, cand);
	return segno;
}

/*
 * We have several lists to manage segments with.  The reserve_list is used to
 * deal with bad blocks.  We try to keep the best (lowest ec) segments on this
 * list.
 * The free_list contains free segments for normal usage.  It usually gets the
 * second pick after the reserve_list.  But when the free_list is running short
 * it is more important to keep the free_list full than to keep a reserve.
 *
 * Segments that are not free are put onto a per-level low_list.  If we have
 * to run garbage collection, we pick a candidate from there.  All segments on
 * those lists should have at least some free space so GC will make progress.
 *
 * And last we have the ec_list, which is used to pick segments for wear
 * leveling.
 *
 * If all appropriate lists are full, we simply free the candidate and forget
 * about that segment for a while.  We have better candidates for each purpose.
 */
static void __add_candidate(struct super_block *sb, struct gc_candidate *cand)
{
	struct logfs_super *super = logfs_super(sb);
	u32 full = super->s_segsize - LOGFS_SEGMENT_RESERVE;

	if (cand->valid == 0) {
		/* 100% free segments */
		log_gc_noisy("add reserve segment %x (ec %x) at %llx\n",
				cand->segno, cand->erase_count,
				dev_ofs(sb, cand->segno, 0));
		cand = add_list(cand, &super->s_reserve_list);
		if (cand) {
			log_gc_noisy("add free segment %x (ec %x) at %llx\n",
					cand->segno, cand->erase_count,
					dev_ofs(sb, cand->segno, 0));
			cand = add_list(cand, &super->s_free_list);
		}
	} else {
		/* good candidates for Garbage Collection */
		if (cand->valid < full)
			cand = add_list(cand, &super->s_low_list[cand->dist]);
		/* good candidates for wear leveling,
		 * segments that were recently written get ignored */
		if (cand)
			cand = add_list(cand, &super->s_ec_list);
	}
	if (cand)
		free_candidate(sb, cand);
}

static int add_candidate(struct super_block *sb, u32 segno, u32 valid, u32 ec,
		u8 dist)
{
	struct logfs_super *super = logfs_super(sb);
	struct gc_candidate *cand;

	cand = kmalloc(sizeof(*cand), GFP_NOFS);
	if (!cand)
		return -ENOMEM;

	cand->segno = segno;
	cand->valid = valid;
	cand->erase_count = ec;
	cand->dist = dist;

	btree_insert32(&super->s_cand_tree, segno, cand, GFP_NOFS);
	__add_candidate(sb, cand);
	return 0;
}

static void remove_segment_from_lists(struct super_block *sb, u32 segno)
{
	struct logfs_super *super = logfs_super(sb);
	struct gc_candidate *cand;

	cand = btree_lookup32(&super->s_cand_tree, segno);
	if (cand) {
		remove_from_list(cand);
		free_candidate(sb, cand);
	}
}

static void scan_segment(struct super_block *sb, u32 segno)
{
	u32 valid, ec = 0;
	gc_level_t gc_level = 0;
	u8 dist;

	if (segment_is_reserved(sb, segno))
		return;

	remove_segment_from_lists(sb, segno);
	valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
	if (valid == RESERVED)
		return;

	dist = root_distance(sb, gc_level);
	add_candidate(sb, segno, valid, ec, dist);
}

static struct gc_candidate *first_in_list(struct candidate_list *list)
{
	if (list->count == 0)
		return NULL;
	return rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
}

/*
 * Find the best segment for garbage collection.  Main criterion is
 * the segment requiring the least effort to clean.  Secondary
 * criterion is to GC on the lowest level available.
 *
 * So we search the least effort segment on the lowest level first,
 * then move up and pick another segment iff it requires significantly
 * less effort.  Hence the LOGFS_MAX_OBJECTSIZE in the comparison.
 */
static struct gc_candidate *get_candidate(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	int i, max_dist;
	struct gc_candidate *cand = NULL, *this;

	max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS - 1);

	for (i = max_dist; i >= 0; i--) {
		this = first_in_list(&super->s_low_list[i]);
		if (!this)
			continue;
		if (!cand)
			cand = this;
		if (this->valid + LOGFS_MAX_OBJECTSIZE <= cand->valid)
			cand = this;
	}
	return cand;
}

static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand)
{
	struct logfs_super *super = logfs_super(sb);
	gc_level_t gc_level;
	u32 cleaned, valid, segno, ec;
	u8 dist;

	if (!cand) {
		log_gc("GC attempted, but no candidate found\n");
		return 0;
	}

	segno = cand->segno;
	dist = cand->dist;
	valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
	free_candidate(sb, cand);
	log_gc("GC segment #%02x at %llx, %x required, %x free, %x valid, %llx free\n",
			segno, (u64)segno << super->s_segshift,
			dist, no_free_segments(sb), valid,
			super->s_free_bytes);
	cleaned = logfs_gc_segment(sb, segno);
	log_gc("GC segment #%02x complete - now %x valid\n", segno,
			valid - cleaned);
	BUG_ON(cleaned != valid);
	return 1;
}

static int logfs_gc_once(struct super_block *sb)
{
	struct gc_candidate *cand;

	cand = get_candidate(sb);
	if (cand)
		remove_from_list(cand);
	return __logfs_gc_once(sb, cand);
}

/* returns 1 if a wrap occurs, 0 otherwise */
static int logfs_scan_some(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	u32 segno;
	int i, ret = 0;

	segno = super->s_sweeper;
	for (i = SCAN_RATIO; i > 0; i--) {
		segno++;
		if (segno >= super->s_no_segs) {
			segno = 0;
			ret = 1;
			/* Break out of the loop.  We want to read a single
			 * block from the segment size on next invocation if
			 * SCAN_RATIO is set to match block size
			 */
			break;
		}

		scan_segment(sb, segno);
	}
	super->s_sweeper = segno;
	return ret;
}

/*
 * In principle, this function should loop forever, looking for GC candidates
 * and moving data.  LogFS is designed in such a way that this loop is
 * guaranteed to terminate.
 *
 * Limiting the loop to some iterations serves purely to catch cases when
 * these guarantees have failed.  An actual endless loop is an obvious bug
 * and should be reported as such.
 */
static void __logfs_gc_pass(struct super_block *sb, int target)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_block *block;
	int round, progress, last_progress = 0;

	/*
	 * Doing too many changes to the segfile at once would result
	 * in a large number of aliases.  Write the journal before
	 * things get out of hand.
	 */
	if (super->s_shadow_tree.no_shadowed_segments >= MAX_OBJ_ALIASES)
		logfs_write_anchor(sb);

	if (no_free_segments(sb) >= target &&
			super->s_no_object_aliases < MAX_OBJ_ALIASES)
		return;

	log_gc("__logfs_gc_pass(%x)\n", target);
	for (round = 0; round < SCAN_ROUNDS; ) {
		if (no_free_segments(sb) >= target)
			goto write_alias;

		/* Sync in-memory state with on-medium state in case they
		 * diverged */
		logfs_write_anchor(sb);
		round += logfs_scan_some(sb);
		if (no_free_segments(sb) >= target)
			goto write_alias;
		progress = logfs_gc_once(sb);
		if (progress)
			last_progress = round;
		else if (round - last_progress > 2)
			break;
		continue;

		/*
		 * The goto logic is nasty, I just don't know a better way to
		 * code it.  GC is supposed to ensure two things:
		 * 1. Enough free segments are available.
		 * 2. The number of aliases is bounded.
		 * When 1. is achieved, we take a look at 2. and write back
		 * some alias-containing blocks, if necessary.  However, after
		 * each such write we need to go back to 1., as writes can
		 * consume free segments.
		 */
write_alias:
		if (super->s_no_object_aliases < MAX_OBJ_ALIASES)
			return;
		if (list_empty(&super->s_object_alias)) {
			/* All aliases are still in btree */
			return;
		}
		log_gc("Write back one alias\n");
		block = list_entry(super->s_object_alias.next,
				struct logfs_block, alias_list);
		block->ops->write_block(block);
		/*
		 * To round off the nasty goto logic, we reset round here.  It
		 * is a safety-net for GC not making any progress and limited
		 * to something reasonably small.  If we incremented it for
		 * every single alias, the loop could terminate rather quickly.
		 */
		round = 0;
	}
	LOGFS_BUG(sb);
}

static int wl_ratelimit(struct super_block *sb, u64 *next_event)
{
	struct logfs_super *super = logfs_super(sb);

	if (*next_event < super->s_gec) {
		*next_event = super->s_gec + WL_RATELIMIT;
		return 0;
	}
	return 1;
}

static void logfs_wl_pass(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	struct gc_candidate *wl_cand, *free_cand;

	if (wl_ratelimit(sb, &super->s_wl_gec_ostore))
		return;

	wl_cand = first_in_list(&super->s_ec_list);
	if (!wl_cand)
		return;
	free_cand = first_in_list(&super->s_free_list);
	if (!free_cand)
		return;

	if (wl_cand->erase_count < free_cand->erase_count + WL_DELTA) {
		remove_from_list(wl_cand);
		__logfs_gc_once(sb, wl_cand);
	}
}

/*
 * The journal needs wear leveling as well.  But moving the journal is an
 * expensive operation so we try to avoid it as much as possible.  And if we
 * have to do it, we move the whole journal, not individual segments.
 *
 * Ratelimiting is not strictly necessary here, it mainly serves to avoid the
 * calculations.  First we check whether moving the journal would be a
 * significant improvement.  That means that a) the current journal segments
 * have more wear than the future journal segments and b) the current journal
 * segments have more wear than normal ostore segments.
 * Rationale for b) is that we don't have to move the journal if it is aging
 * less than the ostore, even if the reserve segments age even less (they are
 * excluded from wear leveling, after all).
 * Next we check that the superblocks have less wear than the journal.  Since
 * moving the journal requires writing the superblocks, we have to protect the
 * superblocks even more than the journal.
 *
 * Also we double the acceptable wear difference, compared to ostore wear
 * leveling.  Journal data is read and rewritten rapidly, comparatively.  So
 * soft errors have much less time to accumulate and we allow the journal to
 * be a bit worse than the ostore.
 */
static void logfs_journal_wl_pass(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	struct gc_candidate *cand;
	u32 min_journal_ec = -1, max_reserve_ec = 0;
	int i;

	if (wl_ratelimit(sb, &super->s_wl_gec_journal))
		return;

	if (super->s_reserve_list.count < super->s_no_journal_segs) {
		/* Reserve is not full enough to move complete journal */
		return;
	}

	journal_for_each(i)
		if (super->s_journal_seg[i])
			min_journal_ec = min(min_journal_ec,
					super->s_journal_ec[i]);
	cand = rb_entry(rb_first(&super->s_free_list.rb_tree),
			struct gc_candidate, rb_node);
	max_reserve_ec = cand->erase_count;
	for (i = 0; i < 2; i++) {
		struct logfs_segment_entry se;
		u32 segno = seg_no(sb, super->s_sb_ofs[i]);
		u32 ec;

		logfs_get_segment_entry(sb, segno, &se);
		ec = be32_to_cpu(se.ec_level) >> 4;
		max_reserve_ec = max(max_reserve_ec, ec);
	}

	if (min_journal_ec > max_reserve_ec + 2 * WL_DELTA) {
		do_logfs_journal_wl_pass(sb);
	}
}

void logfs_gc_pass(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);

	//BUG_ON(mutex_trylock(&logfs_super(sb)->s_w_mutex));
	/* Write journal before free space is getting saturated with dirty
	 * objects.
	 */
	if (super->s_dirty_used_bytes + super->s_dirty_free_bytes
			+ LOGFS_MAX_OBJECTSIZE >= super->s_free_bytes)
		logfs_write_anchor(sb);
	__logfs_gc_pass(sb, super->s_total_levels);
	logfs_wl_pass(sb);
	logfs_journal_wl_pass(sb);
}

static int check_area(struct super_block *sb, int i)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_area *area = super->s_area[i];
	gc_level_t gc_level;
	u32 cleaned, valid, ec;
	u32 segno = area->a_segno;
	u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);

	if (!area->a_is_open)
		return 0;

	if (super->s_devops->can_write_buf(sb, ofs) == 0)
		return 0;

	printk(KERN_INFO"LogFS: Possibly incomplete write at %llx\n", ofs);
	/*
	 * The device cannot write back the write buffer.  Most likely the
	 * wbuf was already written out and the system crashed at some point
	 * before the journal commit happened.  In that case we wouldn't have
	 * to do anything.  But if the crash happened before the wbuf was
	 * written out correctly, we must GC this segment.  So assume the
	 * worst and always do the GC run.
	 */
	area->a_is_open = 0;
	valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
	cleaned = logfs_gc_segment(sb, segno);
	if (cleaned != valid)
		return -EIO;
	return 0;
}

int logfs_check_areas(struct super_block *sb)
{
	int i, err;

	for_each_area(i) {
		err = check_area(sb, i);
		if (err)
			return err;
	}
	return 0;
}

static void logfs_init_candlist(struct candidate_list *list, int maxcount,
		int sort_by_ec)
{
	list->count = 0;
	list->maxcount = maxcount;
	list->sort_by_ec = sort_by_ec;
	list->rb_tree = RB_ROOT;
}

int logfs_init_gc(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	int i;

	btree_init_mempool32(&super->s_cand_tree, super->s_btree_pool);
	logfs_init_candlist(&super->s_free_list, LIST_SIZE + SCAN_RATIO, 1);
	logfs_init_candlist(&super->s_reserve_list,
			super->s_bad_seg_reserve, 1);
	for_each_area(i)
		logfs_init_candlist(&super->s_low_list[i], LIST_SIZE, 0);
	logfs_init_candlist(&super->s_ec_list, LIST_SIZE, 1);
	return 0;
}

static void logfs_cleanup_list(struct super_block *sb,
		struct candidate_list *list)
{
	struct gc_candidate *cand;

	while (list->count) {
		cand = rb_entry(list->rb_tree.rb_node, struct gc_candidate,
				rb_node);
		remove_from_list(cand);
		free_candidate(sb, cand);
	}
	BUG_ON(list->rb_tree.rb_node);
}

void logfs_cleanup_gc(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	int i;

	if (!super->s_free_list.count)
		return;

	/*
	 * FIXME: The btree may still contain a single empty node.  So we
	 * call the grim visitor to clean up that mess.  Btree code should
	 * do it for us, really.
	 */
	btree_grim_visitor32(&super->s_cand_tree, 0, NULL);
	logfs_cleanup_list(sb, &super->s_free_list);
	logfs_cleanup_list(sb, &super->s_reserve_list);
	for_each_area(i)
		logfs_cleanup_list(sb, &super->s_low_list[i]);
	logfs_cleanup_list(sb, &super->s_ec_list);
}