This adds support for the sys_splice system call. Using a pipe as a transport, it can connect to files or sockets (latter as output only). From the splice.c comments: "splice": joining two ropes together by interweaving their strands. This is the "extended pipe" functionality, where a pipe is used as an arbitrary in-memory buffer. Think of a pipe as a small kernel buffer that you can use to transfer data from one end to the other. The traditional unix read/write is extended with a "splice()" operation that transfers data buffers to or from a pipe buffer. Named by Larry McVoy, original implementation from Linus, extended by Jens to support splicing to files and fixing the initial implementation bugs. Signed-off-by: Jens Axboe <axboe@suse.de> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
		
			
				
	
	
		
			1591 lines
		
	
	
	
		
			53 KiB
			
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			1591 lines
		
	
	
	
		
			53 KiB
			
		
	
	
	
		
			C
		
	
	
	
	
	
/*
 | 
						|
 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
 | 
						|
 */
 | 
						|
 | 
						|
#include <linux/time.h>
 | 
						|
#include <linux/reiserfs_fs.h>
 | 
						|
#include <linux/reiserfs_acl.h>
 | 
						|
#include <linux/reiserfs_xattr.h>
 | 
						|
#include <linux/smp_lock.h>
 | 
						|
#include <asm/uaccess.h>
 | 
						|
#include <linux/pagemap.h>
 | 
						|
#include <linux/swap.h>
 | 
						|
#include <linux/writeback.h>
 | 
						|
#include <linux/blkdev.h>
 | 
						|
#include <linux/buffer_head.h>
 | 
						|
#include <linux/quotaops.h>
 | 
						|
 | 
						|
/*
 | 
						|
** We pack the tails of files on file close, not at the time they are written.
 | 
						|
** This implies an unnecessary copy of the tail and an unnecessary indirect item
 | 
						|
** insertion/balancing, for files that are written in one write.
 | 
						|
** It avoids unnecessary tail packings (balances) for files that are written in
 | 
						|
** multiple writes and are small enough to have tails.
 | 
						|
** 
 | 
						|
** file_release is called by the VFS layer when the file is closed.  If
 | 
						|
** this is the last open file descriptor, and the file
 | 
						|
** small enough to have a tail, and the tail is currently in an
 | 
						|
** unformatted node, the tail is converted back into a direct item.
 | 
						|
** 
 | 
						|
** We use reiserfs_truncate_file to pack the tail, since it already has
 | 
						|
** all the conditions coded.  
 | 
						|
*/
 | 
						|
static int reiserfs_file_release(struct inode *inode, struct file *filp)
 | 
						|
{
 | 
						|
 | 
						|
	struct reiserfs_transaction_handle th;
 | 
						|
	int err;
 | 
						|
	int jbegin_failure = 0;
 | 
						|
 | 
						|
	if (!S_ISREG(inode->i_mode))
 | 
						|
		BUG();
 | 
						|
 | 
						|
	/* fast out for when nothing needs to be done */
 | 
						|
	if ((atomic_read(&inode->i_count) > 1 ||
 | 
						|
	     !(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
 | 
						|
	     !tail_has_to_be_packed(inode)) &&
 | 
						|
	    REISERFS_I(inode)->i_prealloc_count <= 0) {
 | 
						|
		return 0;
 | 
						|
	}
 | 
						|
 | 
						|
	reiserfs_write_lock(inode->i_sb);
 | 
						|
	mutex_lock(&inode->i_mutex);
 | 
						|
	/* freeing preallocation only involves relogging blocks that
 | 
						|
	 * are already in the current transaction.  preallocation gets
 | 
						|
	 * freed at the end of each transaction, so it is impossible for
 | 
						|
	 * us to log any additional blocks (including quota blocks)
 | 
						|
	 */
 | 
						|
	err = journal_begin(&th, inode->i_sb, 1);
 | 
						|
	if (err) {
 | 
						|
		/* uh oh, we can't allow the inode to go away while there
 | 
						|
		 * is still preallocation blocks pending.  Try to join the
 | 
						|
		 * aborted transaction
 | 
						|
		 */
 | 
						|
		jbegin_failure = err;
 | 
						|
		err = journal_join_abort(&th, inode->i_sb, 1);
 | 
						|
 | 
						|
		if (err) {
 | 
						|
			/* hmpf, our choices here aren't good.  We can pin the inode
 | 
						|
			 * which will disallow unmount from every happening, we can
 | 
						|
			 * do nothing, which will corrupt random memory on unmount,
 | 
						|
			 * or we can forcibly remove the file from the preallocation
 | 
						|
			 * list, which will leak blocks on disk.  Lets pin the inode
 | 
						|
			 * and let the admin know what is going on.
 | 
						|
			 */
 | 
						|
			igrab(inode);
 | 
						|
			reiserfs_warning(inode->i_sb,
 | 
						|
					 "pinning inode %lu because the "
 | 
						|
					 "preallocation can't be freed");
 | 
						|
			goto out;
 | 
						|
		}
 | 
						|
	}
 | 
						|
	reiserfs_update_inode_transaction(inode);
 | 
						|
 | 
						|
#ifdef REISERFS_PREALLOCATE
 | 
						|
	reiserfs_discard_prealloc(&th, inode);
 | 
						|
#endif
 | 
						|
	err = journal_end(&th, inode->i_sb, 1);
 | 
						|
 | 
						|
	/* copy back the error code from journal_begin */
 | 
						|
	if (!err)
 | 
						|
		err = jbegin_failure;
 | 
						|
 | 
						|
	if (!err && atomic_read(&inode->i_count) <= 1 &&
 | 
						|
	    (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) &&
 | 
						|
	    tail_has_to_be_packed(inode)) {
 | 
						|
		/* if regular file is released by last holder and it has been
 | 
						|
		   appended (we append by unformatted node only) or its direct
 | 
						|
		   item(s) had to be converted, then it may have to be
 | 
						|
		   indirect2direct converted */
 | 
						|
		err = reiserfs_truncate_file(inode, 0);
 | 
						|
	}
 | 
						|
      out:
 | 
						|
	mutex_unlock(&inode->i_mutex);
 | 
						|
	reiserfs_write_unlock(inode->i_sb);
 | 
						|
	return err;
 | 
						|
}
 | 
						|
 | 
						|
/*
 * Void-returning truncate entry point: forwards to
 * reiserfs_truncate_file() with its second argument set to 1 and
 * discards whatever that call returns.
 */
static void reiserfs_vfs_truncate_file(struct inode *inode)
{
	(void)reiserfs_truncate_file(inode, 1);
}
 | 
						|
 | 
						|
/* Sync a reiserfs file. */
 | 
						|
 | 
						|
/*
 | 
						|
 * FIXME: sync_mapping_buffers() never has anything to sync.  Can
 | 
						|
 * be removed...
 | 
						|
 */
 | 
						|
 | 
						|
static int reiserfs_sync_file(struct file *p_s_filp,
 | 
						|
			      struct dentry *p_s_dentry, int datasync)
 | 
						|
{
 | 
						|
	struct inode *p_s_inode = p_s_dentry->d_inode;
 | 
						|
	int n_err;
 | 
						|
	int barrier_done;
 | 
						|
 | 
						|
	if (!S_ISREG(p_s_inode->i_mode))
 | 
						|
		BUG();
 | 
						|
	n_err = sync_mapping_buffers(p_s_inode->i_mapping);
 | 
						|
	reiserfs_write_lock(p_s_inode->i_sb);
 | 
						|
	barrier_done = reiserfs_commit_for_inode(p_s_inode);
 | 
						|
	reiserfs_write_unlock(p_s_inode->i_sb);
 | 
						|
	if (barrier_done != 1)
 | 
						|
		blkdev_issue_flush(p_s_inode->i_sb->s_bdev, NULL);
 | 
						|
	if (barrier_done < 0)
 | 
						|
		return barrier_done;
 | 
						|
	return (n_err < 0) ? -EIO : 0;
 | 
						|
}
 | 
						|
 | 
						|
/* I really do not want to play with memory shortage right now, so
   to simplify the code, we are not going to write more than this many pages at
   a time. This still should considerably improve performance compared to 4k
   at a time case. This is 32 pages of 4k size.
   The whole expansion is parenthesized so that the macro acts as a single
   value when it is used inside a larger expression. */
#define REISERFS_WRITE_PAGES_AT_A_TIME ((128 * 1024) / PAGE_CACHE_SIZE)
 | 
						|
 | 
						|
/* Allocates blocks for a file to fulfil write request.
 | 
						|
   Maps all unmapped but prepared pages from the list.
 | 
						|
   Updates metadata with newly allocated blocknumbers as needed */
 | 
						|
static int reiserfs_allocate_blocks_for_region(struct reiserfs_transaction_handle *th, struct inode *inode,	/* Inode we work with */
 | 
						|
					       loff_t pos,	/* Writing position */
 | 
						|
					       int num_pages,	/* number of pages write going
 | 
						|
								   to touch */
 | 
						|
					       int write_bytes,	/* amount of bytes to write */
 | 
						|
					       struct page **prepared_pages,	/* array of
 | 
						|
										   prepared pages
 | 
						|
										 */
 | 
						|
					       int blocks_to_allocate	/* Amount of blocks we
 | 
						|
									   need to allocate to
 | 
						|
									   fit the data into file
 | 
						|
									 */
 | 
						|
    )
 | 
						|
{
 | 
						|
	struct cpu_key key;	// cpu key of item that we are going to deal with
 | 
						|
	struct item_head *ih;	// pointer to item head that we are going to deal with
 | 
						|
	struct buffer_head *bh;	// Buffer head that contains items that we are going to deal with
 | 
						|
	__le32 *item;		// pointer to item we are going to deal with
 | 
						|
	INITIALIZE_PATH(path);	// path to item, that we are going to deal with.
 | 
						|
	b_blocknr_t *allocated_blocks;	// Pointer to a place where allocated blocknumbers would be stored.
 | 
						|
	reiserfs_blocknr_hint_t hint;	// hint structure for block allocator.
 | 
						|
	size_t res;		// return value of various functions that we call.
 | 
						|
	int curr_block;		// current block used to keep track of unmapped blocks.
 | 
						|
	int i;			// loop counter
 | 
						|
	int itempos;		// position in item
 | 
						|
	unsigned int from = (pos & (PAGE_CACHE_SIZE - 1));	// writing position in
 | 
						|
	// first page
 | 
						|
	unsigned int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1;	/* last modified byte offset in last page */
 | 
						|
	__u64 hole_size;	// amount of blocks for a file hole, if it needed to be created.
 | 
						|
	int modifying_this_item = 0;	// Flag for items traversal code to keep track
 | 
						|
	// of the fact that we already prepared
 | 
						|
	// current block for journal
 | 
						|
	int will_prealloc = 0;
 | 
						|
	RFALSE(!blocks_to_allocate,
 | 
						|
	       "green-9004: tried to allocate zero blocks?");
 | 
						|
 | 
						|
	/* only preallocate if this is a small write */
 | 
						|
	if (REISERFS_I(inode)->i_prealloc_count ||
 | 
						|
	    (!(write_bytes & (inode->i_sb->s_blocksize - 1)) &&
 | 
						|
	     blocks_to_allocate <
 | 
						|
	     REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize))
 | 
						|
		will_prealloc =
 | 
						|
		    REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize;
 | 
						|
 | 
						|
	allocated_blocks = kmalloc((blocks_to_allocate + will_prealloc) *
 | 
						|
				   sizeof(b_blocknr_t), GFP_NOFS);
 | 
						|
	if (!allocated_blocks)
 | 
						|
		return -ENOMEM;
 | 
						|
 | 
						|
	/* First we compose a key to point at the writing position, we want to do
 | 
						|
	   that outside of any locking region. */
 | 
						|
	make_cpu_key(&key, inode, pos + 1, TYPE_ANY, 3 /*key length */ );
 | 
						|
 | 
						|
	/* If we came here, it means we absolutely need to open a transaction,
 | 
						|
	   since we need to allocate some blocks */
 | 
						|
	reiserfs_write_lock(inode->i_sb);	// Journaling stuff and we need that.
 | 
						|
	res = journal_begin(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb));	// Wish I know if this number enough
 | 
						|
	if (res)
 | 
						|
		goto error_exit;
 | 
						|
	reiserfs_update_inode_transaction(inode);
 | 
						|
 | 
						|
	/* Look for the in-tree position of our write, need path for block allocator */
 | 
						|
	res = search_for_position_by_key(inode->i_sb, &key, &path);
 | 
						|
	if (res == IO_ERROR) {
 | 
						|
		res = -EIO;
 | 
						|
		goto error_exit;
 | 
						|
	}
 | 
						|
 | 
						|
	/* Allocate blocks */
 | 
						|
	/* First fill in "hint" structure for block allocator */
 | 
						|
	hint.th = th;		// transaction handle.
 | 
						|
	hint.path = &path;	// Path, so that block allocator can determine packing locality or whatever it needs to determine.
 | 
						|
	hint.inode = inode;	// Inode is needed by block allocator too.
 | 
						|
	hint.search_start = 0;	// We have no hint on where to search free blocks for block allocator.
 | 
						|
	hint.key = key.on_disk_key;	// on disk key of file.
 | 
						|
	hint.block = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);	// Number of disk blocks this file occupies already.
 | 
						|
	hint.formatted_node = 0;	// We are allocating blocks for unformatted node.
 | 
						|
	hint.preallocate = will_prealloc;
 | 
						|
 | 
						|
	/* Call block allocator to allocate blocks */
 | 
						|
	res =
 | 
						|
	    reiserfs_allocate_blocknrs(&hint, allocated_blocks,
 | 
						|
				       blocks_to_allocate, blocks_to_allocate);
 | 
						|
	if (res != CARRY_ON) {
 | 
						|
		if (res == NO_DISK_SPACE) {
 | 
						|
			/* We flush the transaction in case of no space. This way some
 | 
						|
			   blocks might become free */
 | 
						|
			SB_JOURNAL(inode->i_sb)->j_must_wait = 1;
 | 
						|
			res = restart_transaction(th, inode, &path);
 | 
						|
			if (res)
 | 
						|
				goto error_exit;
 | 
						|
 | 
						|
			/* We might have scheduled, so search again */
 | 
						|
			res =
 | 
						|
			    search_for_position_by_key(inode->i_sb, &key,
 | 
						|
						       &path);
 | 
						|
			if (res == IO_ERROR) {
 | 
						|
				res = -EIO;
 | 
						|
				goto error_exit;
 | 
						|
			}
 | 
						|
 | 
						|
			/* update changed info for hint structure. */
 | 
						|
			res =
 | 
						|
			    reiserfs_allocate_blocknrs(&hint, allocated_blocks,
 | 
						|
						       blocks_to_allocate,
 | 
						|
						       blocks_to_allocate);
 | 
						|
			if (res != CARRY_ON) {
 | 
						|
				res = res == QUOTA_EXCEEDED ? -EDQUOT : -ENOSPC;
 | 
						|
				pathrelse(&path);
 | 
						|
				goto error_exit;
 | 
						|
			}
 | 
						|
		} else {
 | 
						|
			res = res == QUOTA_EXCEEDED ? -EDQUOT : -ENOSPC;
 | 
						|
			pathrelse(&path);
 | 
						|
			goto error_exit;
 | 
						|
		}
 | 
						|
	}
 | 
						|
#ifdef __BIG_ENDIAN
 | 
						|
	// Too bad, I have not found any way to convert a given region from
 | 
						|
	// cpu format to little endian format
 | 
						|
	{
 | 
						|
		int i;
 | 
						|
		for (i = 0; i < blocks_to_allocate; i++)
 | 
						|
			allocated_blocks[i] = cpu_to_le32(allocated_blocks[i]);
 | 
						|
	}
 | 
						|
#endif
 | 
						|
 | 
						|
	/* Blocks allocating well might have scheduled and tree might have changed,
 | 
						|
	   let's search the tree again */
 | 
						|
	/* find where in the tree our write should go */
 | 
						|
	res = search_for_position_by_key(inode->i_sb, &key, &path);
 | 
						|
	if (res == IO_ERROR) {
 | 
						|
		res = -EIO;
 | 
						|
		goto error_exit_free_blocks;
 | 
						|
	}
 | 
						|
 | 
						|
	bh = get_last_bh(&path);	// Get a bufferhead for last element in path.
 | 
						|
	ih = get_ih(&path);	// Get a pointer to last item head in path.
 | 
						|
	item = get_item(&path);	// Get a pointer to last item in path
 | 
						|
 | 
						|
	/* Let's see what we have found */
 | 
						|
	if (res != POSITION_FOUND) {	/* position not found, this means that we
 | 
						|
					   might need to append file with holes
 | 
						|
					   first */
 | 
						|
		// Since we are writing past the file's end, we need to find out if
 | 
						|
		// there is a hole that needs to be inserted before our writing
 | 
						|
		// position, and how many blocks it is going to cover (we need to
 | 
						|
		//  populate pointers to file blocks representing the hole with zeros)
 | 
						|
 | 
						|
		{
 | 
						|
			int item_offset = 1;
 | 
						|
			/*
 | 
						|
			 * if ih is stat data, its offset is 0 and we don't want to
 | 
						|
			 * add 1 to pos in the hole_size calculation
 | 
						|
			 */
 | 
						|
			if (is_statdata_le_ih(ih))
 | 
						|
				item_offset = 0;
 | 
						|
			hole_size = (pos + item_offset -
 | 
						|
				     (le_key_k_offset
 | 
						|
				      (get_inode_item_key_version(inode),
 | 
						|
				       &(ih->ih_key)) + op_bytes_number(ih,
 | 
						|
									inode->
 | 
						|
									i_sb->
 | 
						|
									s_blocksize)))
 | 
						|
			    >> inode->i_sb->s_blocksize_bits;
 | 
						|
		}
 | 
						|
 | 
						|
		if (hole_size > 0) {
 | 
						|
			int to_paste = min_t(__u64, hole_size, MAX_ITEM_LEN(inode->i_sb->s_blocksize) / UNFM_P_SIZE);	// How much data to insert first time.
 | 
						|
			/* area filled with zeroes, to supply as list of zero blocknumbers
 | 
						|
			   We allocate it outside of loop just in case loop would spin for
 | 
						|
			   several iterations. */
 | 
						|
			char *zeros = kmalloc(to_paste * UNFM_P_SIZE, GFP_ATOMIC);	// We cannot insert more than MAX_ITEM_LEN bytes anyway.
 | 
						|
			if (!zeros) {
 | 
						|
				res = -ENOMEM;
 | 
						|
				goto error_exit_free_blocks;
 | 
						|
			}
 | 
						|
			memset(zeros, 0, to_paste * UNFM_P_SIZE);
 | 
						|
			do {
 | 
						|
				to_paste =
 | 
						|
				    min_t(__u64, hole_size,
 | 
						|
					  MAX_ITEM_LEN(inode->i_sb->
 | 
						|
						       s_blocksize) /
 | 
						|
					  UNFM_P_SIZE);
 | 
						|
				if (is_indirect_le_ih(ih)) {
 | 
						|
					/* Ok, there is existing indirect item already. Need to append it */
 | 
						|
					/* Calculate position past inserted item */
 | 
						|
					make_cpu_key(&key, inode,
 | 
						|
						     le_key_k_offset
 | 
						|
						     (get_inode_item_key_version
 | 
						|
						      (inode),
 | 
						|
						      &(ih->ih_key)) +
 | 
						|
						     op_bytes_number(ih,
 | 
						|
								     inode->
 | 
						|
								     i_sb->
 | 
						|
								     s_blocksize),
 | 
						|
						     TYPE_INDIRECT, 3);
 | 
						|
					res =
 | 
						|
					    reiserfs_paste_into_item(th, &path,
 | 
						|
								     &key,
 | 
						|
								     inode,
 | 
						|
								     (char *)
 | 
						|
								     zeros,
 | 
						|
								     UNFM_P_SIZE
 | 
						|
								     *
 | 
						|
								     to_paste);
 | 
						|
					if (res) {
 | 
						|
						kfree(zeros);
 | 
						|
						goto error_exit_free_blocks;
 | 
						|
					}
 | 
						|
				} else if (is_statdata_le_ih(ih)) {
 | 
						|
					/* No existing item, create it */
 | 
						|
					/* item head for new item */
 | 
						|
					struct item_head ins_ih;
 | 
						|
 | 
						|
					/* create a key for our new item */
 | 
						|
					make_cpu_key(&key, inode, 1,
 | 
						|
						     TYPE_INDIRECT, 3);
 | 
						|
 | 
						|
					/* Create new item head for our new item */
 | 
						|
					make_le_item_head(&ins_ih, &key,
 | 
						|
							  key.version, 1,
 | 
						|
							  TYPE_INDIRECT,
 | 
						|
							  to_paste *
 | 
						|
							  UNFM_P_SIZE,
 | 
						|
							  0 /* free space */ );
 | 
						|
 | 
						|
					/* Find where such item should live in the tree */
 | 
						|
					res =
 | 
						|
					    search_item(inode->i_sb, &key,
 | 
						|
							&path);
 | 
						|
					if (res != ITEM_NOT_FOUND) {
 | 
						|
						/* item should not exist, otherwise we have error */
 | 
						|
						if (res != -ENOSPC) {
 | 
						|
							reiserfs_warning(inode->
 | 
						|
									 i_sb,
 | 
						|
									 "green-9008: search_by_key (%K) returned %d",
 | 
						|
									 &key,
 | 
						|
									 res);
 | 
						|
						}
 | 
						|
						res = -EIO;
 | 
						|
						kfree(zeros);
 | 
						|
						goto error_exit_free_blocks;
 | 
						|
					}
 | 
						|
					res =
 | 
						|
					    reiserfs_insert_item(th, &path,
 | 
						|
								 &key, &ins_ih,
 | 
						|
								 inode,
 | 
						|
								 (char *)zeros);
 | 
						|
				} else {
 | 
						|
					reiserfs_panic(inode->i_sb,
 | 
						|
						       "green-9011: Unexpected key type %K\n",
 | 
						|
						       &key);
 | 
						|
				}
 | 
						|
				if (res) {
 | 
						|
					kfree(zeros);
 | 
						|
					goto error_exit_free_blocks;
 | 
						|
				}
 | 
						|
				/* Now we want to check if transaction is too full, and if it is
 | 
						|
				   we restart it. This will also free the path. */
 | 
						|
				if (journal_transaction_should_end
 | 
						|
				    (th, th->t_blocks_allocated)) {
 | 
						|
					res =
 | 
						|
					    restart_transaction(th, inode,
 | 
						|
								&path);
 | 
						|
					if (res) {
 | 
						|
						pathrelse(&path);
 | 
						|
						kfree(zeros);
 | 
						|
						goto error_exit;
 | 
						|
					}
 | 
						|
				}
 | 
						|
 | 
						|
				/* Well, need to recalculate path and stuff */
 | 
						|
				set_cpu_key_k_offset(&key,
 | 
						|
						     cpu_key_k_offset(&key) +
 | 
						|
						     (to_paste << inode->
 | 
						|
						      i_blkbits));
 | 
						|
				res =
 | 
						|
				    search_for_position_by_key(inode->i_sb,
 | 
						|
							       &key, &path);
 | 
						|
				if (res == IO_ERROR) {
 | 
						|
					res = -EIO;
 | 
						|
					kfree(zeros);
 | 
						|
					goto error_exit_free_blocks;
 | 
						|
				}
 | 
						|
				bh = get_last_bh(&path);
 | 
						|
				ih = get_ih(&path);
 | 
						|
				item = get_item(&path);
 | 
						|
				hole_size -= to_paste;
 | 
						|
			} while (hole_size);
 | 
						|
			kfree(zeros);
 | 
						|
		}
 | 
						|
	}
 | 
						|
	// Go through existing indirect items first
 | 
						|
	// replace all zeroes with blocknumbers from list
 | 
						|
	// Note that if no corresponding item was found, by previous search,
 | 
						|
	// it means there are no existing in-tree representation for file area
 | 
						|
	// we are going to overwrite, so there is nothing to scan through for holes.
 | 
						|
	for (curr_block = 0, itempos = path.pos_in_item;
 | 
						|
	     curr_block < blocks_to_allocate && res == POSITION_FOUND;) {
 | 
						|
	      retry:
 | 
						|
 | 
						|
		if (itempos >= ih_item_len(ih) / UNFM_P_SIZE) {
 | 
						|
			/* We run out of data in this indirect item, let's look for another
 | 
						|
			   one. */
 | 
						|
			/* First if we are already modifying current item, log it */
 | 
						|
			if (modifying_this_item) {
 | 
						|
				journal_mark_dirty(th, inode->i_sb, bh);
 | 
						|
				modifying_this_item = 0;
 | 
						|
			}
 | 
						|
			/* Then set the key to look for a new indirect item (offset of old
 | 
						|
			   item is added to old item length */
 | 
						|
			set_cpu_key_k_offset(&key,
 | 
						|
					     le_key_k_offset
 | 
						|
					     (get_inode_item_key_version(inode),
 | 
						|
					      &(ih->ih_key)) +
 | 
						|
					     op_bytes_number(ih,
 | 
						|
							     inode->i_sb->
 | 
						|
							     s_blocksize));
 | 
						|
			/* Search ofor position of new key in the tree. */
 | 
						|
			res =
 | 
						|
			    search_for_position_by_key(inode->i_sb, &key,
 | 
						|
						       &path);
 | 
						|
			if (res == IO_ERROR) {
 | 
						|
				res = -EIO;
 | 
						|
				goto error_exit_free_blocks;
 | 
						|
			}
 | 
						|
			bh = get_last_bh(&path);
 | 
						|
			ih = get_ih(&path);
 | 
						|
			item = get_item(&path);
 | 
						|
			itempos = path.pos_in_item;
 | 
						|
			continue;	// loop to check all kinds of conditions and so on.
 | 
						|
		}
 | 
						|
		/* Ok, we have correct position in item now, so let's see if it is
 | 
						|
		   representing file hole (blocknumber is zero) and fill it if needed */
 | 
						|
		if (!item[itempos]) {
 | 
						|
			/* Ok, a hole. Now we need to check if we already prepared this
 | 
						|
			   block to be journaled */
 | 
						|
			while (!modifying_this_item) {	// loop until succeed
 | 
						|
				/* Well, this item is not journaled yet, so we must prepare
 | 
						|
				   it for journal first, before we can change it */
 | 
						|
				struct item_head tmp_ih;	// We copy item head of found item,
 | 
						|
				// here to detect if fs changed under
 | 
						|
				// us while we were preparing for
 | 
						|
				// journal.
 | 
						|
				int fs_gen;	// We store fs generation here to find if someone
 | 
						|
				// changes fs under our feet
 | 
						|
 | 
						|
				copy_item_head(&tmp_ih, ih);	// Remember itemhead
 | 
						|
				fs_gen = get_generation(inode->i_sb);	// remember fs generation
 | 
						|
				reiserfs_prepare_for_journal(inode->i_sb, bh, 1);	// Prepare a buffer within which indirect item is stored for changing.
 | 
						|
				if (fs_changed(fs_gen, inode->i_sb)
 | 
						|
				    && item_moved(&tmp_ih, &path)) {
 | 
						|
					// Sigh, fs was changed under us, we need to look for new
 | 
						|
					// location of item we are working with
 | 
						|
 | 
						|
					/* unmark prepaerd area as journaled and search for it's
 | 
						|
					   new position */
 | 
						|
					reiserfs_restore_prepared_buffer(inode->
 | 
						|
									 i_sb,
 | 
						|
									 bh);
 | 
						|
					res =
 | 
						|
					    search_for_position_by_key(inode->
 | 
						|
								       i_sb,
 | 
						|
								       &key,
 | 
						|
								       &path);
 | 
						|
					if (res == IO_ERROR) {
 | 
						|
						res = -EIO;
 | 
						|
						goto error_exit_free_blocks;
 | 
						|
					}
 | 
						|
					bh = get_last_bh(&path);
 | 
						|
					ih = get_ih(&path);
 | 
						|
					item = get_item(&path);
 | 
						|
					itempos = path.pos_in_item;
 | 
						|
					goto retry;
 | 
						|
				}
 | 
						|
				modifying_this_item = 1;
 | 
						|
			}
 | 
						|
			item[itempos] = allocated_blocks[curr_block];	// Assign new block
 | 
						|
			curr_block++;
 | 
						|
		}
 | 
						|
		itempos++;
 | 
						|
	}
 | 
						|
 | 
						|
	if (modifying_this_item) {	// We need to log last-accessed block, if it
 | 
						|
		// was modified, but not logged yet.
 | 
						|
		journal_mark_dirty(th, inode->i_sb, bh);
 | 
						|
	}
 | 
						|
 | 
						|
	if (curr_block < blocks_to_allocate) {
 | 
						|
		// Oh, well need to append to indirect item, or to create indirect item
 | 
						|
		// if there weren't any
 | 
						|
		if (is_indirect_le_ih(ih)) {
 | 
						|
			// Existing indirect item - append. First calculate key for append
 | 
						|
			// position. We do not need to recalculate path as it should
 | 
						|
			// already point to correct place.
 | 
						|
			make_cpu_key(&key, inode,
 | 
						|
				     le_key_k_offset(get_inode_item_key_version
 | 
						|
						     (inode),
 | 
						|
						     &(ih->ih_key)) +
 | 
						|
				     op_bytes_number(ih,
 | 
						|
						     inode->i_sb->s_blocksize),
 | 
						|
				     TYPE_INDIRECT, 3);
 | 
						|
			res =
 | 
						|
			    reiserfs_paste_into_item(th, &path, &key, inode,
 | 
						|
						     (char *)(allocated_blocks +
 | 
						|
							      curr_block),
 | 
						|
						     UNFM_P_SIZE *
 | 
						|
						     (blocks_to_allocate -
 | 
						|
						      curr_block));
 | 
						|
			if (res) {
 | 
						|
				goto error_exit_free_blocks;
 | 
						|
			}
 | 
						|
		} else if (is_statdata_le_ih(ih)) {
 | 
						|
			// Last found item was statdata. That means we need to create indirect item.
 | 
						|
			struct item_head ins_ih;	/* itemhead for new item */
 | 
						|
 | 
						|
			/* create a key for our new item */
 | 
						|
			make_cpu_key(&key, inode, 1, TYPE_INDIRECT, 3);	// Position one,
 | 
						|
			// because that's
 | 
						|
			// where first
 | 
						|
			// indirect item
 | 
						|
			// begins
 | 
						|
			/* Create new item head for our new item */
 | 
						|
			make_le_item_head(&ins_ih, &key, key.version, 1,
 | 
						|
					  TYPE_INDIRECT,
 | 
						|
					  (blocks_to_allocate -
 | 
						|
					   curr_block) * UNFM_P_SIZE,
 | 
						|
					  0 /* free space */ );
 | 
						|
			/* Find where such item should live in the tree */
 | 
						|
			res = search_item(inode->i_sb, &key, &path);
 | 
						|
			if (res != ITEM_NOT_FOUND) {
 | 
						|
				/* Well, if we have found such item already, or some error
 | 
						|
				   occured, we need to warn user and return error */
 | 
						|
				if (res != -ENOSPC) {
 | 
						|
					reiserfs_warning(inode->i_sb,
 | 
						|
							 "green-9009: search_by_key (%K) "
 | 
						|
							 "returned %d", &key,
 | 
						|
							 res);
 | 
						|
				}
 | 
						|
				res = -EIO;
 | 
						|
				goto error_exit_free_blocks;
 | 
						|
			}
 | 
						|
			/* Insert item into the tree with the data as its body */
 | 
						|
			res =
 | 
						|
			    reiserfs_insert_item(th, &path, &key, &ins_ih,
 | 
						|
						 inode,
 | 
						|
						 (char *)(allocated_blocks +
 | 
						|
							  curr_block));
 | 
						|
		} else {
 | 
						|
			reiserfs_panic(inode->i_sb,
 | 
						|
				       "green-9010: unexpected item type for key %K\n",
 | 
						|
				       &key);
 | 
						|
		}
 | 
						|
	}
 | 
						|
	// the caller is responsible for closing the transaction
 | 
						|
	// unless we return an error, they are also responsible for logging
 | 
						|
	// the inode.
 | 
						|
	//
 | 
						|
	pathrelse(&path);
 | 
						|
	/*
 | 
						|
	 * cleanup prellocation from previous writes
 | 
						|
	 * if this is a partial block write
 | 
						|
	 */
 | 
						|
	if (write_bytes & (inode->i_sb->s_blocksize - 1))
 | 
						|
		reiserfs_discard_prealloc(th, inode);
 | 
						|
	reiserfs_write_unlock(inode->i_sb);
 | 
						|
 | 
						|
	// go through all the pages/buffers and map the buffers to newly allocated
 | 
						|
	// blocks (so that system knows where to write these pages later).
 | 
						|
	curr_block = 0;
 | 
						|
	for (i = 0; i < num_pages; i++) {
 | 
						|
		struct page *page = prepared_pages[i];	//current page
 | 
						|
		struct buffer_head *head = page_buffers(page);	// first buffer for a page
 | 
						|
		int block_start, block_end;	// in-page offsets for buffers.
 | 
						|
 | 
						|
		if (!page_buffers(page))
 | 
						|
			reiserfs_panic(inode->i_sb,
 | 
						|
				       "green-9005: No buffers for prepared page???");
 | 
						|
 | 
						|
		/* For each buffer in page */
 | 
						|
		for (bh = head, block_start = 0; bh != head || !block_start;
 | 
						|
		     block_start = block_end, bh = bh->b_this_page) {
 | 
						|
			if (!bh)
 | 
						|
				reiserfs_panic(inode->i_sb,
 | 
						|
					       "green-9006: Allocated but absent buffer for a page?");
 | 
						|
			block_end = block_start + inode->i_sb->s_blocksize;
 | 
						|
			if (i == 0 && block_end <= from)
 | 
						|
				/* if this buffer is before requested data to map, skip it */
 | 
						|
				continue;
 | 
						|
			if (i == num_pages - 1 && block_start >= to)
 | 
						|
				/* If this buffer is after requested data to map, abort
 | 
						|
				   processing of current page */
 | 
						|
				break;
 | 
						|
 | 
						|
			if (!buffer_mapped(bh)) {	// Ok, unmapped buffer, need to map it
 | 
						|
				map_bh(bh, inode->i_sb,
 | 
						|
				       le32_to_cpu(allocated_blocks
 | 
						|
						   [curr_block]));
 | 
						|
				curr_block++;
 | 
						|
				set_buffer_new(bh);
 | 
						|
			}
 | 
						|
		}
 | 
						|
	}
 | 
						|
 | 
						|
	RFALSE(curr_block > blocks_to_allocate,
 | 
						|
	       "green-9007: Used too many blocks? weird");
 | 
						|
 | 
						|
	kfree(allocated_blocks);
 | 
						|
	return 0;
 | 
						|
 | 
						|
// Need to deal with transaction here.
 | 
						|
      error_exit_free_blocks:
 | 
						|
	pathrelse(&path);
 | 
						|
	// free blocks
 | 
						|
	for (i = 0; i < blocks_to_allocate; i++)
 | 
						|
		reiserfs_free_block(th, inode, le32_to_cpu(allocated_blocks[i]),
 | 
						|
				    1);
 | 
						|
 | 
						|
      error_exit:
 | 
						|
	if (th->t_trans_id) {
 | 
						|
		int err;
 | 
						|
		// update any changes we made to blk count
 | 
						|
		mark_inode_dirty(inode);
 | 
						|
		err =
 | 
						|
		    journal_end(th, inode->i_sb,
 | 
						|
				JOURNAL_PER_BALANCE_CNT * 3 + 1 +
 | 
						|
				2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb));
 | 
						|
		if (err)
 | 
						|
			res = err;
 | 
						|
	}
 | 
						|
	reiserfs_write_unlock(inode->i_sb);
 | 
						|
	kfree(allocated_blocks);
 | 
						|
 | 
						|
	return res;
 | 
						|
}
 | 
						|
 | 
						|
/* Unlock pages prepared by reiserfs_prepare_file_region_for_write.
   For each of the first num_pages entries of prepared_pages this calls
   try_to_free_buffers(), unlock_page() and page_cache_release(), in
   that order. */
static void reiserfs_unprepare_pages(struct page **prepared_pages,	/* list of locked pages */
				     size_t num_pages /* amount of pages */ )
{
	size_t i;		// loop counter, same type as num_pages so the
	// loop condition does not mix signed and unsigned

	for (i = 0; i < num_pages; i++) {
		struct page *page = prepared_pages[i];

		try_to_free_buffers(page);
		unlock_page(page);
		page_cache_release(page);
	}
}
 | 
						|
 | 
						|
/* This function will copy data from userspace to specified pages within
 | 
						|
   supplied byte range */
 | 
						|
static int reiserfs_copy_from_user_to_file_region(loff_t pos,	/* In-file position */
 | 
						|
						  int num_pages,	/* Number of pages affected */
 | 
						|
						  int write_bytes,	/* Amount of bytes to write */
 | 
						|
						  struct page **prepared_pages,	/* pointer to 
 | 
						|
										   array to
 | 
						|
										   prepared pages
 | 
						|
										 */
 | 
						|
						  const char __user * buf	/* Pointer to user-supplied
 | 
						|
										   data */
 | 
						|
    )
 | 
						|
{
 | 
						|
	long page_fault = 0;	// status of copy_from_user.
 | 
						|
	int i;			// loop counter.
 | 
						|
	int offset;		// offset in page
 | 
						|
 | 
						|
	for (i = 0, offset = (pos & (PAGE_CACHE_SIZE - 1)); i < num_pages;
 | 
						|
	     i++, offset = 0) {
 | 
						|
		size_t count = min_t(size_t, PAGE_CACHE_SIZE - offset, write_bytes);	// How much of bytes to write to this page
 | 
						|
		struct page *page = prepared_pages[i];	// Current page we process.
 | 
						|
 | 
						|
		fault_in_pages_readable(buf, count);
 | 
						|
 | 
						|
		/* Copy data from userspace to the current page */
 | 
						|
		kmap(page);
 | 
						|
		page_fault = __copy_from_user(page_address(page) + offset, buf, count);	// Copy the data.
 | 
						|
		/* Flush processor's dcache for this page */
 | 
						|
		flush_dcache_page(page);
 | 
						|
		kunmap(page);
 | 
						|
		buf += count;
 | 
						|
		write_bytes -= count;
 | 
						|
 | 
						|
		if (page_fault)
 | 
						|
			break;	// Was there a fault? abort.
 | 
						|
	}
 | 
						|
 | 
						|
	return page_fault ? -EFAULT : 0;
 | 
						|
}
 | 
						|
 | 
						|
/* taken fs/buffer.c:__block_commit_write */
 | 
						|
int reiserfs_commit_page(struct inode *inode, struct page *page,
 | 
						|
			 unsigned from, unsigned to)
 | 
						|
{
 | 
						|
	unsigned block_start, block_end;
 | 
						|
	int partial = 0;
 | 
						|
	unsigned blocksize;
 | 
						|
	struct buffer_head *bh, *head;
 | 
						|
	unsigned long i_size_index = inode->i_size >> PAGE_CACHE_SHIFT;
 | 
						|
	int new;
 | 
						|
	int logit = reiserfs_file_data_log(inode);
 | 
						|
	struct super_block *s = inode->i_sb;
 | 
						|
	int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
 | 
						|
	struct reiserfs_transaction_handle th;
 | 
						|
	int ret = 0;
 | 
						|
 | 
						|
	th.t_trans_id = 0;
 | 
						|
	blocksize = 1 << inode->i_blkbits;
 | 
						|
 | 
						|
	if (logit) {
 | 
						|
		reiserfs_write_lock(s);
 | 
						|
		ret = journal_begin(&th, s, bh_per_page + 1);
 | 
						|
		if (ret)
 | 
						|
			goto drop_write_lock;
 | 
						|
		reiserfs_update_inode_transaction(inode);
 | 
						|
	}
 | 
						|
	for (bh = head = page_buffers(page), block_start = 0;
 | 
						|
	     bh != head || !block_start;
 | 
						|
	     block_start = block_end, bh = bh->b_this_page) {
 | 
						|
 | 
						|
		new = buffer_new(bh);
 | 
						|
		clear_buffer_new(bh);
 | 
						|
		block_end = block_start + blocksize;
 | 
						|
		if (block_end <= from || block_start >= to) {
 | 
						|
			if (!buffer_uptodate(bh))
 | 
						|
				partial = 1;
 | 
						|
		} else {
 | 
						|
			set_buffer_uptodate(bh);
 | 
						|
			if (logit) {
 | 
						|
				reiserfs_prepare_for_journal(s, bh, 1);
 | 
						|
				journal_mark_dirty(&th, s, bh);
 | 
						|
			} else if (!buffer_dirty(bh)) {
 | 
						|
				mark_buffer_dirty(bh);
 | 
						|
				/* do data=ordered on any page past the end
 | 
						|
				 * of file and any buffer marked BH_New.
 | 
						|
				 */
 | 
						|
				if (reiserfs_data_ordered(inode->i_sb) &&
 | 
						|
				    (new || page->index >= i_size_index)) {
 | 
						|
					reiserfs_add_ordered_list(inode, bh);
 | 
						|
				}
 | 
						|
			}
 | 
						|
		}
 | 
						|
	}
 | 
						|
	if (logit) {
 | 
						|
		ret = journal_end(&th, s, bh_per_page + 1);
 | 
						|
	      drop_write_lock:
 | 
						|
		reiserfs_write_unlock(s);
 | 
						|
	}
 | 
						|
	/*
 | 
						|
	 * If this is a partial write which happened to make all buffers
 | 
						|
	 * uptodate then we can optimize away a bogus readpage() for
 | 
						|
	 * the next read(). Here we 'discover' whether the page went
 | 
						|
	 * uptodate as a result of this (potentially partial) write.
 | 
						|
	 */
 | 
						|
	if (!partial)
 | 
						|
		SetPageUptodate(page);
 | 
						|
	return ret;
 | 
						|
}
 | 
						|
 | 
						|
/* Submit pages for write. This was separated from actual file copying
 | 
						|
   because we might want to allocate block numbers in-between.
 | 
						|
   This function assumes that caller will adjust file size to correct value. */
 | 
						|
static int reiserfs_submit_file_region_for_write(struct reiserfs_transaction_handle *th, struct inode *inode, loff_t pos,	/* Writing position offset */
 | 
						|
						 size_t num_pages,	/* Number of pages to write */
 | 
						|
						 size_t write_bytes,	/* number of bytes to write */
 | 
						|
						 struct page **prepared_pages	/* list of pages */
 | 
						|
    )
 | 
						|
{
 | 
						|
	int status;		// return status of block_commit_write.
 | 
						|
	int retval = 0;		// Return value we are going to return.
 | 
						|
	int i;			// loop counter
 | 
						|
	int offset;		// Writing offset in page.
 | 
						|
	int orig_write_bytes = write_bytes;
 | 
						|
	int sd_update = 0;
 | 
						|
 | 
						|
	for (i = 0, offset = (pos & (PAGE_CACHE_SIZE - 1)); i < num_pages;
 | 
						|
	     i++, offset = 0) {
 | 
						|
		int count = min_t(int, PAGE_CACHE_SIZE - offset, write_bytes);	// How much of bytes to write to this page
 | 
						|
		struct page *page = prepared_pages[i];	// Current page we process.
 | 
						|
 | 
						|
		status =
 | 
						|
		    reiserfs_commit_page(inode, page, offset, offset + count);
 | 
						|
		if (status)
 | 
						|
			retval = status;	// To not overcomplicate matters We are going to
 | 
						|
		// submit all the pages even if there was error.
 | 
						|
		// we only remember error status to report it on
 | 
						|
		// exit.
 | 
						|
		write_bytes -= count;
 | 
						|
	}
 | 
						|
	/* now that we've gotten all the ordered buffers marked dirty,
 | 
						|
	 * we can safely update i_size and close any running transaction
 | 
						|
	 */
 | 
						|
	if (pos + orig_write_bytes > inode->i_size) {
 | 
						|
		inode->i_size = pos + orig_write_bytes;	// Set new size
 | 
						|
		/* If the file have grown so much that tail packing is no
 | 
						|
		 * longer possible, reset "need to pack" flag */
 | 
						|
		if ((have_large_tails(inode->i_sb) &&
 | 
						|
		     inode->i_size > i_block_size(inode) * 4) ||
 | 
						|
		    (have_small_tails(inode->i_sb) &&
 | 
						|
		     inode->i_size > i_block_size(inode)))
 | 
						|
			REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
 | 
						|
		else if ((have_large_tails(inode->i_sb) &&
 | 
						|
			  inode->i_size < i_block_size(inode) * 4) ||
 | 
						|
			 (have_small_tails(inode->i_sb) &&
 | 
						|
			  inode->i_size < i_block_size(inode)))
 | 
						|
			REISERFS_I(inode)->i_flags |= i_pack_on_close_mask;
 | 
						|
 | 
						|
		if (th->t_trans_id) {
 | 
						|
			reiserfs_write_lock(inode->i_sb);
 | 
						|
			// this sets the proper flags for O_SYNC to trigger a commit
 | 
						|
			mark_inode_dirty(inode);
 | 
						|
			reiserfs_write_unlock(inode->i_sb);
 | 
						|
		} else
 | 
						|
			mark_inode_dirty(inode);
 | 
						|
 | 
						|
		sd_update = 1;
 | 
						|
	}
 | 
						|
	if (th->t_trans_id) {
 | 
						|
		reiserfs_write_lock(inode->i_sb);
 | 
						|
		if (!sd_update)
 | 
						|
			mark_inode_dirty(inode);
 | 
						|
		status = journal_end(th, th->t_super, th->t_blocks_allocated);
 | 
						|
		if (status)
 | 
						|
			retval = status;
 | 
						|
		reiserfs_write_unlock(inode->i_sb);
 | 
						|
	}
 | 
						|
	th->t_trans_id = 0;
 | 
						|
 | 
						|
	/* 
 | 
						|
	 * we have to unlock the pages after updating i_size, otherwise
 | 
						|
	 * we race with writepage
 | 
						|
	 */
 | 
						|
	for (i = 0; i < num_pages; i++) {
 | 
						|
		struct page *page = prepared_pages[i];
 | 
						|
		unlock_page(page);
 | 
						|
		mark_page_accessed(page);
 | 
						|
		page_cache_release(page);
 | 
						|
	}
 | 
						|
	return retval;
 | 
						|
}
 | 
						|
 | 
						|
/* Look if passed writing region is going to touch file's tail
 | 
						|
   (if it is present). And if it is, convert the tail to unformatted node */
 | 
						|
static int reiserfs_check_for_tail_and_convert(struct inode *inode,	/* inode to deal with */
 | 
						|
					       loff_t pos,	/* Writing position */
 | 
						|
					       int write_bytes	/* amount of bytes to write */
 | 
						|
    )
 | 
						|
{
 | 
						|
	INITIALIZE_PATH(path);	// needed for search_for_position
 | 
						|
	struct cpu_key key;	// Key that would represent last touched writing byte.
 | 
						|
	struct item_head *ih;	// item header of found block;
 | 
						|
	int res;		// Return value of various functions we call.
 | 
						|
	int cont_expand_offset;	// We will put offset for generic_cont_expand here
 | 
						|
	// This can be int just because tails are created
 | 
						|
	// only for small files.
 | 
						|
 | 
						|
/* this embodies a dependency on a particular tail policy */
 | 
						|
	if (inode->i_size >= inode->i_sb->s_blocksize * 4) {
 | 
						|
		/* such a big files do not have tails, so we won't bother ourselves
 | 
						|
		   to look for tails, simply return */
 | 
						|
		return 0;
 | 
						|
	}
 | 
						|
 | 
						|
	reiserfs_write_lock(inode->i_sb);
 | 
						|
	/* find the item containing the last byte to be written, or if
 | 
						|
	 * writing past the end of the file then the last item of the
 | 
						|
	 * file (and then we check its type). */
 | 
						|
	make_cpu_key(&key, inode, pos + write_bytes + 1, TYPE_ANY,
 | 
						|
		     3 /*key length */ );
 | 
						|
	res = search_for_position_by_key(inode->i_sb, &key, &path);
 | 
						|
	if (res == IO_ERROR) {
 | 
						|
		reiserfs_write_unlock(inode->i_sb);
 | 
						|
		return -EIO;
 | 
						|
	}
 | 
						|
	ih = get_ih(&path);
 | 
						|
	res = 0;
 | 
						|
	if (is_direct_le_ih(ih)) {
 | 
						|
		/* Ok, closest item is file tail (tails are stored in "direct"
 | 
						|
		 * items), so we need to unpack it. */
 | 
						|
		/* To not overcomplicate matters, we just call generic_cont_expand
 | 
						|
		   which will in turn call other stuff and finally will boil down to
 | 
						|
		   reiserfs_get_block() that would do necessary conversion. */
 | 
						|
		cont_expand_offset =
 | 
						|
		    le_key_k_offset(get_inode_item_key_version(inode),
 | 
						|
				    &(ih->ih_key));
 | 
						|
		pathrelse(&path);
 | 
						|
		res = generic_cont_expand(inode, cont_expand_offset);
 | 
						|
	} else
 | 
						|
		pathrelse(&path);
 | 
						|
 | 
						|
	reiserfs_write_unlock(inode->i_sb);
 | 
						|
	return res;
 | 
						|
}
 | 
						|
 | 
						|
/* This function locks pages starting from @pos for @inode.
 | 
						|
   @num_pages pages are locked and stored in
 | 
						|
   @prepared_pages array. Also buffers are allocated for these pages.
 | 
						|
   First and last page of the region is read if it is overwritten only
 | 
						|
   partially. If last page did not exist before write (file hole or file
 | 
						|
   append), it is zeroed, then. 
 | 
						|
   Returns number of unallocated blocks that should be allocated to cover
 | 
						|
   new file data.*/
 | 
						|
static int reiserfs_prepare_file_region_for_write(struct inode *inode
 | 
						|
						  /* Inode of the file */ ,
 | 
						|
						  loff_t pos,	/* position in the file */
 | 
						|
						  size_t num_pages,	/* number of pages to
 | 
						|
									   prepare */
 | 
						|
						  size_t write_bytes,	/* Amount of bytes to be
 | 
						|
									   overwritten from
 | 
						|
									   @pos */
 | 
						|
						  struct page **prepared_pages	/* pointer to array
 | 
						|
										   where to store
 | 
						|
										   prepared pages */
 | 
						|
    )
 | 
						|
{
 | 
						|
	int res = 0;		// Return values of different functions we call.
 | 
						|
	unsigned long index = pos >> PAGE_CACHE_SHIFT;	// Offset in file in pages.
 | 
						|
	int from = (pos & (PAGE_CACHE_SIZE - 1));	// Writing offset in first page
 | 
						|
	int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1;
 | 
						|
	/* offset of last modified byte in last
 | 
						|
	   page */
 | 
						|
	struct address_space *mapping = inode->i_mapping;	// Pages are mapped here.
 | 
						|
	int i;			// Simple counter
 | 
						|
	int blocks = 0;		/* Return value (blocks that should be allocated) */
 | 
						|
	struct buffer_head *bh, *head;	// Current bufferhead and first bufferhead
 | 
						|
	// of a page.
 | 
						|
	unsigned block_start, block_end;	// Starting and ending offsets of current
 | 
						|
	// buffer in the page.
 | 
						|
	struct buffer_head *wait[2], **wait_bh = wait;	// Buffers for page, if
 | 
						|
	// Page appeared to be not up
 | 
						|
	// to date. Note how we have
 | 
						|
	// at most 2 buffers, this is
 | 
						|
	// because we at most may
 | 
						|
	// partially overwrite two
 | 
						|
	// buffers for one page. One at                                                 // the beginning of write area
 | 
						|
	// and one at the end.
 | 
						|
	// Everything inthe middle gets                                                 // overwritten totally.
 | 
						|
 | 
						|
	struct cpu_key key;	// cpu key of item that we are going to deal with
 | 
						|
	struct item_head *ih = NULL;	// pointer to item head that we are going to deal with
 | 
						|
	struct buffer_head *itembuf = NULL;	// Buffer head that contains items that we are going to deal with
 | 
						|
	INITIALIZE_PATH(path);	// path to item, that we are going to deal with.
 | 
						|
	__le32 *item = NULL;	// pointer to item we are going to deal with
 | 
						|
	int item_pos = -1;	/* Position in indirect item */
 | 
						|
 | 
						|
	if (num_pages < 1) {
 | 
						|
		reiserfs_warning(inode->i_sb,
 | 
						|
				 "green-9001: reiserfs_prepare_file_region_for_write "
 | 
						|
				 "called with zero number of pages to process");
 | 
						|
		return -EFAULT;
 | 
						|
	}
 | 
						|
 | 
						|
	/* We have 2 loops for pages. In first loop we grab and lock the pages, so
 | 
						|
	   that nobody would touch these until we release the pages. Then
 | 
						|
	   we'd start to deal with mapping buffers to blocks. */
 | 
						|
	for (i = 0; i < num_pages; i++) {
 | 
						|
		prepared_pages[i] = grab_cache_page(mapping, index + i);	// locks the page
 | 
						|
		if (!prepared_pages[i]) {
 | 
						|
			res = -ENOMEM;
 | 
						|
			goto failed_page_grabbing;
 | 
						|
		}
 | 
						|
		if (!page_has_buffers(prepared_pages[i]))
 | 
						|
			create_empty_buffers(prepared_pages[i],
 | 
						|
					     inode->i_sb->s_blocksize, 0);
 | 
						|
	}
 | 
						|
 | 
						|
	/* Let's count amount of blocks for a case where all the blocks
 | 
						|
	   overwritten are new (we will substract already allocated blocks later) */
 | 
						|
	if (num_pages > 2)
 | 
						|
		/* These are full-overwritten pages so we count all the blocks in
 | 
						|
		   these pages are counted as needed to be allocated */
 | 
						|
		blocks =
 | 
						|
		    (num_pages - 2) << (PAGE_CACHE_SHIFT - inode->i_blkbits);
 | 
						|
 | 
						|
	/* count blocks needed for first page (possibly partially written) */
 | 
						|
	blocks += ((PAGE_CACHE_SIZE - from) >> inode->i_blkbits) + !!(from & (inode->i_sb->s_blocksize - 1));	/* roundup */
 | 
						|
 | 
						|
	/* Now we account for last page. If last page == first page (we
 | 
						|
	   overwrite only one page), we substract all the blocks past the
 | 
						|
	   last writing position in a page out of already calculated number
 | 
						|
	   of blocks */
 | 
						|
	blocks += ((num_pages > 1) << (PAGE_CACHE_SHIFT - inode->i_blkbits)) -
 | 
						|
	    ((PAGE_CACHE_SIZE - to) >> inode->i_blkbits);
 | 
						|
	/* Note how we do not roundup here since partial blocks still
 | 
						|
	   should be allocated */
 | 
						|
 | 
						|
	/* Now if all the write area lies past the file end, no point in
 | 
						|
	   maping blocks, since there is none, so we just zero out remaining
 | 
						|
	   parts of first and last pages in write area (if needed) */
 | 
						|
	if ((pos & ~((loff_t) PAGE_CACHE_SIZE - 1)) > inode->i_size) {
 | 
						|
		if (from != 0) {	/* First page needs to be partially zeroed */
 | 
						|
			char *kaddr = kmap_atomic(prepared_pages[0], KM_USER0);
 | 
						|
			memset(kaddr, 0, from);
 | 
						|
			kunmap_atomic(kaddr, KM_USER0);
 | 
						|
		}
 | 
						|
		if (to != PAGE_CACHE_SIZE) {	/* Last page needs to be partially zeroed */
 | 
						|
			char *kaddr =
 | 
						|
			    kmap_atomic(prepared_pages[num_pages - 1],
 | 
						|
					KM_USER0);
 | 
						|
			memset(kaddr + to, 0, PAGE_CACHE_SIZE - to);
 | 
						|
			kunmap_atomic(kaddr, KM_USER0);
 | 
						|
		}
 | 
						|
 | 
						|
		/* Since all blocks are new - use already calculated value */
 | 
						|
		return blocks;
 | 
						|
	}
 | 
						|
 | 
						|
	/* Well, since we write somewhere into the middle of a file, there is
 | 
						|
	   possibility we are writing over some already allocated blocks, so
 | 
						|
	   let's map these blocks and substract number of such blocks out of blocks
 | 
						|
	   we need to allocate (calculated above) */
 | 
						|
	/* Mask write position to start on blocksize, we do it out of the
 | 
						|
	   loop for performance reasons */
 | 
						|
	pos &= ~((loff_t) inode->i_sb->s_blocksize - 1);
 | 
						|
	/* Set cpu key to the starting position in a file (on left block boundary) */
 | 
						|
	make_cpu_key(&key, inode,
 | 
						|
		     1 + ((pos) & ~((loff_t) inode->i_sb->s_blocksize - 1)),
 | 
						|
		     TYPE_ANY, 3 /*key length */ );
 | 
						|
 | 
						|
	reiserfs_write_lock(inode->i_sb);	// We need that for at least search_by_key()
 | 
						|
	for (i = 0; i < num_pages; i++) {
 | 
						|
 | 
						|
		head = page_buffers(prepared_pages[i]);
 | 
						|
		/* For each buffer in the page */
 | 
						|
		for (bh = head, block_start = 0; bh != head || !block_start;
 | 
						|
		     block_start = block_end, bh = bh->b_this_page) {
 | 
						|
			if (!bh)
 | 
						|
				reiserfs_panic(inode->i_sb,
 | 
						|
					       "green-9002: Allocated but absent buffer for a page?");
 | 
						|
			/* Find where this buffer ends */
 | 
						|
			block_end = block_start + inode->i_sb->s_blocksize;
 | 
						|
			if (i == 0 && block_end <= from)
 | 
						|
				/* if this buffer is before requested data to map, skip it */
 | 
						|
				continue;
 | 
						|
 | 
						|
			if (i == num_pages - 1 && block_start >= to) {
 | 
						|
				/* If this buffer is after requested data to map, abort
 | 
						|
				   processing of current page */
 | 
						|
				break;
 | 
						|
			}
 | 
						|
 | 
						|
			if (buffer_mapped(bh) && bh->b_blocknr != 0) {
 | 
						|
				/* This is optimisation for a case where buffer is mapped
 | 
						|
				   and have blocknumber assigned. In case significant amount
 | 
						|
				   of such buffers are present, we may avoid some amount
 | 
						|
				   of search_by_key calls.
 | 
						|
				   Probably it would be possible to move parts of this code
 | 
						|
				   out of BKL, but I afraid that would overcomplicate code
 | 
						|
				   without any noticeable benefit.
 | 
						|
				 */
 | 
						|
				item_pos++;
 | 
						|
				/* Update the key */
 | 
						|
				set_cpu_key_k_offset(&key,
 | 
						|
						     cpu_key_k_offset(&key) +
 | 
						|
						     inode->i_sb->s_blocksize);
 | 
						|
				blocks--;	// Decrease the amount of blocks that need to be
 | 
						|
				// allocated
 | 
						|
				continue;	// Go to the next buffer
 | 
						|
			}
 | 
						|
 | 
						|
			if (!itembuf ||	/* if first iteration */
 | 
						|
			    item_pos >= ih_item_len(ih) / UNFM_P_SIZE) {	/* or if we progressed past the
 | 
						|
										   current unformatted_item */
 | 
						|
				/* Try to find next item */
 | 
						|
				res =
 | 
						|
				    search_for_position_by_key(inode->i_sb,
 | 
						|
							       &key, &path);
 | 
						|
				/* Abort if no more items */
 | 
						|
				if (res != POSITION_FOUND) {
 | 
						|
					/* make sure later loops don't use this item */
 | 
						|
					itembuf = NULL;
 | 
						|
					item = NULL;
 | 
						|
					break;
 | 
						|
				}
 | 
						|
 | 
						|
				/* Update information about current indirect item */
 | 
						|
				itembuf = get_last_bh(&path);
 | 
						|
				ih = get_ih(&path);
 | 
						|
				item = get_item(&path);
 | 
						|
				item_pos = path.pos_in_item;
 | 
						|
 | 
						|
				RFALSE(!is_indirect_le_ih(ih),
 | 
						|
				       "green-9003: indirect item expected");
 | 
						|
			}
 | 
						|
 | 
						|
			/* See if there is some block associated with the file
 | 
						|
			   at that position, map the buffer to this block */
 | 
						|
			if (get_block_num(item, item_pos)) {
 | 
						|
				map_bh(bh, inode->i_sb,
 | 
						|
				       get_block_num(item, item_pos));
 | 
						|
				blocks--;	// Decrease the amount of blocks that need to be
 | 
						|
				// allocated
 | 
						|
			}
 | 
						|
			item_pos++;
 | 
						|
			/* Update the key */
 | 
						|
			set_cpu_key_k_offset(&key,
 | 
						|
					     cpu_key_k_offset(&key) +
 | 
						|
					     inode->i_sb->s_blocksize);
 | 
						|
		}
 | 
						|
	}
 | 
						|
	pathrelse(&path);	// Free the path
 | 
						|
	reiserfs_write_unlock(inode->i_sb);
 | 
						|
 | 
						|
	/* Now zero out unmappend buffers for the first and last pages of
 | 
						|
	   write area or issue read requests if page is mapped. */
 | 
						|
	/* First page, see if it is not uptodate */
 | 
						|
	if (!PageUptodate(prepared_pages[0])) {
 | 
						|
		head = page_buffers(prepared_pages[0]);
 | 
						|
 | 
						|
		/* For each buffer in page */
 | 
						|
		for (bh = head, block_start = 0; bh != head || !block_start;
 | 
						|
		     block_start = block_end, bh = bh->b_this_page) {
 | 
						|
 | 
						|
			if (!bh)
 | 
						|
				reiserfs_panic(inode->i_sb,
 | 
						|
					       "green-9002: Allocated but absent buffer for a page?");
 | 
						|
			/* Find where this buffer ends */
 | 
						|
			block_end = block_start + inode->i_sb->s_blocksize;
 | 
						|
			if (block_end <= from)
 | 
						|
				/* if this buffer is before requested data to map, skip it */
 | 
						|
				continue;
 | 
						|
			if (block_start < from) {	/* Aha, our partial buffer */
 | 
						|
				if (buffer_mapped(bh)) {	/* If it is mapped, we need to
 | 
						|
								   issue READ request for it to
 | 
						|
								   not loose data */
 | 
						|
					ll_rw_block(READ, 1, &bh);
 | 
						|
					*wait_bh++ = bh;
 | 
						|
				} else {	/* Not mapped, zero it */
 | 
						|
					char *kaddr =
 | 
						|
					    kmap_atomic(prepared_pages[0],
 | 
						|
							KM_USER0);
 | 
						|
					memset(kaddr + block_start, 0,
 | 
						|
					       from - block_start);
 | 
						|
					kunmap_atomic(kaddr, KM_USER0);
 | 
						|
					set_buffer_uptodate(bh);
 | 
						|
				}
 | 
						|
			}
 | 
						|
		}
 | 
						|
	}
 | 
						|
 | 
						|
	/* Last page, see if it is not uptodate, or if the last page is past the end of the file. */
 | 
						|
	if (!PageUptodate(prepared_pages[num_pages - 1]) ||
 | 
						|
	    ((pos + write_bytes) >> PAGE_CACHE_SHIFT) >
 | 
						|
	    (inode->i_size >> PAGE_CACHE_SHIFT)) {
 | 
						|
		head = page_buffers(prepared_pages[num_pages - 1]);
 | 
						|
 | 
						|
		/* for each buffer in page */
 | 
						|
		for (bh = head, block_start = 0; bh != head || !block_start;
 | 
						|
		     block_start = block_end, bh = bh->b_this_page) {
 | 
						|
 | 
						|
			if (!bh)
 | 
						|
				reiserfs_panic(inode->i_sb,
 | 
						|
					       "green-9002: Allocated but absent buffer for a page?");
 | 
						|
			/* Find where this buffer ends */
 | 
						|
			block_end = block_start + inode->i_sb->s_blocksize;
 | 
						|
			if (block_start >= to)
 | 
						|
				/* if this buffer is after requested data to map, skip it */
 | 
						|
				break;
 | 
						|
			if (block_end > to) {	/* Aha, our partial buffer */
 | 
						|
				if (buffer_mapped(bh)) {	/* If it is mapped, we need to
 | 
						|
								   issue READ request for it to
 | 
						|
								   not loose data */
 | 
						|
					ll_rw_block(READ, 1, &bh);
 | 
						|
					*wait_bh++ = bh;
 | 
						|
				} else {	/* Not mapped, zero it */
 | 
						|
					char *kaddr =
 | 
						|
					    kmap_atomic(prepared_pages
 | 
						|
							[num_pages - 1],
 | 
						|
							KM_USER0);
 | 
						|
					memset(kaddr + to, 0, block_end - to);
 | 
						|
					kunmap_atomic(kaddr, KM_USER0);
 | 
						|
					set_buffer_uptodate(bh);
 | 
						|
				}
 | 
						|
			}
 | 
						|
		}
 | 
						|
	}
 | 
						|
 | 
						|
	/* Wait for read requests we made to happen, if necessary */
 | 
						|
	while (wait_bh > wait) {
 | 
						|
		wait_on_buffer(*--wait_bh);
 | 
						|
		if (!buffer_uptodate(*wait_bh)) {
 | 
						|
			res = -EIO;
 | 
						|
			goto failed_read;
 | 
						|
		}
 | 
						|
	}
 | 
						|
 | 
						|
	return blocks;
 | 
						|
      failed_page_grabbing:
 | 
						|
	num_pages = i;
 | 
						|
      failed_read:
 | 
						|
	reiserfs_unprepare_pages(prepared_pages, num_pages);
 | 
						|
	return res;
 | 
						|
}
 | 
						|
 | 
						|
/* Write @count bytes at position @ppos in a file indicated by @file
 | 
						|
   from the buffer @buf.  
 | 
						|
 | 
						|
   generic_file_write() is only appropriate for filesystems that are not seeking to optimize performance and want
 | 
						|
   something simple that works.  It is not for serious use by general purpose filesystems, excepting the one that it was
 | 
						|
   written for (ext2/3).  This is for several reasons:
 | 
						|
 | 
						|
   * It has no understanding of any filesystem specific optimizations.
 | 
						|
 | 
						|
   * It enters the filesystem repeatedly for each page that is written.
 | 
						|
 | 
						|
   * It depends on reiserfs_get_block() function which if implemented by reiserfs performs costly search_by_key
 | 
						|
   * operation for each page it is supplied with. By contrast reiserfs_file_write() feeds as much as possible at a time
 | 
						|
   * to reiserfs which allows for fewer tree traversals.
 | 
						|
 | 
						|
   * Each indirect pointer insertion takes a lot of cpu, because it involves memory moves inside of blocks.
 | 
						|
 | 
						|
   * Asking the block allocation code for blocks one at a time is slightly less efficient.
 | 
						|
 | 
						|
   All of these reasons for not using only generic file write were understood back when reiserfs was first miscoded to
 | 
						|
   use it, but we were in a hurry to make code freeze, and so it couldn't be revised then.  This new code should make
 | 
						|
   things right finally.
 | 
						|
 | 
						|
   Future Features: providing search_by_key with hints.
 | 
						|
 | 
						|
*/
 | 
						|
static ssize_t reiserfs_file_write(struct file *file,	/* the file we are going to write into */
 | 
						|
				   const char __user * buf,	/*  pointer to user supplied data
 | 
						|
								   (in userspace) */
 | 
						|
				   size_t count,	/* amount of bytes to write */
 | 
						|
				   loff_t * ppos	/* pointer to position in file that we start writing at. Should be updated to
 | 
						|
							 * new current position before returning. */
 | 
						|
				   )
 | 
						|
{
 | 
						|
	size_t already_written = 0;	// Number of bytes already written to the file.
 | 
						|
	loff_t pos;		// Current position in the file.
 | 
						|
	ssize_t res;		// return value of various functions that we call.
 | 
						|
	int err = 0;
 | 
						|
	struct inode *inode = file->f_dentry->d_inode;	// Inode of the file that we are writing to.
 | 
						|
	/* To simplify coding at this time, we store
 | 
						|
	   locked pages in array for now */
 | 
						|
	struct page *prepared_pages[REISERFS_WRITE_PAGES_AT_A_TIME];
 | 
						|
	struct reiserfs_transaction_handle th;
 | 
						|
	th.t_trans_id = 0;
 | 
						|
 | 
						|
	/* If a filesystem is converted from 3.5 to 3.6, we'll have v3.5 items
 | 
						|
	* lying around (most of the disk, in fact). Despite the filesystem
 | 
						|
	* now being a v3.6 format, the old items still can't support large
 | 
						|
	* file sizes. Catch this case here, as the rest of the VFS layer is
 | 
						|
	* oblivious to the different limitations between old and new items.
 | 
						|
	* reiserfs_setattr catches this for truncates. This chunk is lifted
 | 
						|
	* from generic_write_checks. */
 | 
						|
	if (get_inode_item_key_version (inode) == KEY_FORMAT_3_5 &&
 | 
						|
	    *ppos + count > MAX_NON_LFS) {
 | 
						|
		if (*ppos >= MAX_NON_LFS) {
 | 
						|
			send_sig(SIGXFSZ, current, 0);
 | 
						|
			return -EFBIG;
 | 
						|
		}
 | 
						|
		if (count > MAX_NON_LFS - (unsigned long)*ppos)
 | 
						|
			count = MAX_NON_LFS - (unsigned long)*ppos;
 | 
						|
	}
 | 
						|
 | 
						|
	if (file->f_flags & O_DIRECT) {	// Direct IO needs treatment
 | 
						|
		ssize_t result, after_file_end = 0;
 | 
						|
		if ((*ppos + count >= inode->i_size)
 | 
						|
		    || (file->f_flags & O_APPEND)) {
 | 
						|
			/* If we are appending a file, we need to put this savelink in here.
 | 
						|
			   If we will crash while doing direct io, finish_unfinished will
 | 
						|
			   cut the garbage from the file end. */
 | 
						|
			reiserfs_write_lock(inode->i_sb);
 | 
						|
			err =
 | 
						|
			    journal_begin(&th, inode->i_sb,
 | 
						|
					  JOURNAL_PER_BALANCE_CNT);
 | 
						|
			if (err) {
 | 
						|
				reiserfs_write_unlock(inode->i_sb);
 | 
						|
				return err;
 | 
						|
			}
 | 
						|
			reiserfs_update_inode_transaction(inode);
 | 
						|
			add_save_link(&th, inode, 1 /* Truncate */ );
 | 
						|
			after_file_end = 1;
 | 
						|
			err =
 | 
						|
			    journal_end(&th, inode->i_sb,
 | 
						|
					JOURNAL_PER_BALANCE_CNT);
 | 
						|
			reiserfs_write_unlock(inode->i_sb);
 | 
						|
			if (err)
 | 
						|
				return err;
 | 
						|
		}
 | 
						|
		result = generic_file_write(file, buf, count, ppos);
 | 
						|
 | 
						|
		if (after_file_end) {	/* Now update i_size and remove the savelink */
 | 
						|
			struct reiserfs_transaction_handle th;
 | 
						|
			reiserfs_write_lock(inode->i_sb);
 | 
						|
			err = journal_begin(&th, inode->i_sb, 1);
 | 
						|
			if (err) {
 | 
						|
				reiserfs_write_unlock(inode->i_sb);
 | 
						|
				return err;
 | 
						|
			}
 | 
						|
			reiserfs_update_inode_transaction(inode);
 | 
						|
			mark_inode_dirty(inode);
 | 
						|
			err = journal_end(&th, inode->i_sb, 1);
 | 
						|
			if (err) {
 | 
						|
				reiserfs_write_unlock(inode->i_sb);
 | 
						|
				return err;
 | 
						|
			}
 | 
						|
			err = remove_save_link(inode, 1 /* truncate */ );
 | 
						|
			reiserfs_write_unlock(inode->i_sb);
 | 
						|
			if (err)
 | 
						|
				return err;
 | 
						|
		}
 | 
						|
 | 
						|
		return result;
 | 
						|
	}
 | 
						|
 | 
						|
	if (unlikely((ssize_t) count < 0))
 | 
						|
		return -EINVAL;
 | 
						|
 | 
						|
	if (unlikely(!access_ok(VERIFY_READ, buf, count)))
 | 
						|
		return -EFAULT;
 | 
						|
 | 
						|
	mutex_lock(&inode->i_mutex);	// locks the entire file for just us
 | 
						|
 | 
						|
	pos = *ppos;
 | 
						|
 | 
						|
	/* Check if we can write to specified region of file, file
 | 
						|
	   is not overly big and this kind of stuff. Adjust pos and
 | 
						|
	   count, if needed */
 | 
						|
	res = generic_write_checks(file, &pos, &count, 0);
 | 
						|
	if (res)
 | 
						|
		goto out;
 | 
						|
 | 
						|
	if (count == 0)
 | 
						|
		goto out;
 | 
						|
 | 
						|
	res = remove_suid(file->f_dentry);
 | 
						|
	if (res)
 | 
						|
		goto out;
 | 
						|
 | 
						|
	file_update_time(file);
 | 
						|
 | 
						|
	// Ok, we are done with all the checks.
 | 
						|
 | 
						|
	// Now we should start real work
 | 
						|
 | 
						|
	/* If we are going to write past the file's packed tail or if we are going
 | 
						|
	   to overwrite part of the tail, we need that tail to be converted into
 | 
						|
	   unformatted node */
 | 
						|
	res = reiserfs_check_for_tail_and_convert(inode, pos, count);
 | 
						|
	if (res)
 | 
						|
		goto out;
 | 
						|
 | 
						|
	while (count > 0) {
 | 
						|
		/* This is the main loop in which we running until some error occures
 | 
						|
		   or until we write all of the data. */
 | 
						|
		size_t num_pages;	/* amount of pages we are going to write this iteration */
 | 
						|
		size_t write_bytes;	/* amount of bytes to write during this iteration */
 | 
						|
		size_t blocks_to_allocate;	/* how much blocks we need to allocate for this iteration */
 | 
						|
 | 
						|
		/*  (pos & (PAGE_CACHE_SIZE-1)) is an idiom for offset into a page of pos */
 | 
						|
		num_pages = !!((pos + count) & (PAGE_CACHE_SIZE - 1)) +	/* round up partial
 | 
						|
									   pages */
 | 
						|
		    ((count +
 | 
						|
		      (pos & (PAGE_CACHE_SIZE - 1))) >> PAGE_CACHE_SHIFT);
 | 
						|
		/* convert size to amount of
 | 
						|
		   pages */
 | 
						|
		reiserfs_write_lock(inode->i_sb);
 | 
						|
		if (num_pages > REISERFS_WRITE_PAGES_AT_A_TIME
 | 
						|
		    || num_pages > reiserfs_can_fit_pages(inode->i_sb)) {
 | 
						|
			/* If we were asked to write more data than we want to or if there
 | 
						|
			   is not that much space, then we shorten amount of data to write
 | 
						|
			   for this iteration. */
 | 
						|
			num_pages =
 | 
						|
			    min_t(size_t, REISERFS_WRITE_PAGES_AT_A_TIME,
 | 
						|
				  reiserfs_can_fit_pages(inode->i_sb));
 | 
						|
			/* Also we should not forget to set size in bytes accordingly */
 | 
						|
			write_bytes = (num_pages << PAGE_CACHE_SHIFT) -
 | 
						|
			    (pos & (PAGE_CACHE_SIZE - 1));
 | 
						|
			/* If position is not on the
 | 
						|
			   start of the page, we need
 | 
						|
			   to substract the offset
 | 
						|
			   within page */
 | 
						|
		} else
 | 
						|
			write_bytes = count;
 | 
						|
 | 
						|
		/* reserve the blocks to be allocated later, so that later on
 | 
						|
		   we still have the space to write the blocks to */
 | 
						|
		reiserfs_claim_blocks_to_be_allocated(inode->i_sb,
 | 
						|
						      num_pages <<
 | 
						|
						      (PAGE_CACHE_SHIFT -
 | 
						|
						       inode->i_blkbits));
 | 
						|
		reiserfs_write_unlock(inode->i_sb);
 | 
						|
 | 
						|
		if (!num_pages) {	/* If we do not have enough space even for a single page... */
 | 
						|
			if (pos >
 | 
						|
			    inode->i_size + inode->i_sb->s_blocksize -
 | 
						|
			    (pos & (inode->i_sb->s_blocksize - 1))) {
 | 
						|
				res = -ENOSPC;
 | 
						|
				break;	// In case we are writing past the end of the last file block, break.
 | 
						|
			}
 | 
						|
			// Otherwise we are possibly overwriting the file, so
 | 
						|
			// let's set write size to be equal or less than blocksize.
 | 
						|
			// This way we get it correctly for file holes.
 | 
						|
			// But overwriting files on absolutelly full volumes would not
 | 
						|
			// be very efficient. Well, people are not supposed to fill
 | 
						|
			// 100% of disk space anyway.
 | 
						|
			write_bytes =
 | 
						|
			    min_t(size_t, count,
 | 
						|
				  inode->i_sb->s_blocksize -
 | 
						|
				  (pos & (inode->i_sb->s_blocksize - 1)));
 | 
						|
			num_pages = 1;
 | 
						|
			// No blocks were claimed before, so do it now.
 | 
						|
			reiserfs_claim_blocks_to_be_allocated(inode->i_sb,
 | 
						|
							      1 <<
 | 
						|
							      (PAGE_CACHE_SHIFT
 | 
						|
							       -
 | 
						|
							       inode->
 | 
						|
							       i_blkbits));
 | 
						|
		}
 | 
						|
 | 
						|
		/* Prepare for writing into the region, read in all the
 | 
						|
		   partially overwritten pages, if needed. And lock the pages,
 | 
						|
		   so that nobody else can access these until we are done.
 | 
						|
		   We get number of actual blocks needed as a result. */
 | 
						|
		res = reiserfs_prepare_file_region_for_write(inode, pos,
 | 
						|
							     num_pages,
 | 
						|
							     write_bytes,
 | 
						|
							     prepared_pages);
 | 
						|
		if (res < 0) {
 | 
						|
			reiserfs_release_claimed_blocks(inode->i_sb,
 | 
						|
							num_pages <<
 | 
						|
							(PAGE_CACHE_SHIFT -
 | 
						|
							 inode->i_blkbits));
 | 
						|
			break;
 | 
						|
		}
 | 
						|
 | 
						|
		blocks_to_allocate = res;
 | 
						|
 | 
						|
		/* First we correct our estimate of how many blocks we need */
 | 
						|
		reiserfs_release_claimed_blocks(inode->i_sb,
 | 
						|
						(num_pages <<
 | 
						|
						 (PAGE_CACHE_SHIFT -
 | 
						|
						  inode->i_sb->
 | 
						|
						  s_blocksize_bits)) -
 | 
						|
						blocks_to_allocate);
 | 
						|
 | 
						|
		if (blocks_to_allocate > 0) {	/*We only allocate blocks if we need to */
 | 
						|
			/* Fill in all the possible holes and append the file if needed */
 | 
						|
			res =
 | 
						|
			    reiserfs_allocate_blocks_for_region(&th, inode, pos,
 | 
						|
								num_pages,
 | 
						|
								write_bytes,
 | 
						|
								prepared_pages,
 | 
						|
								blocks_to_allocate);
 | 
						|
		}
 | 
						|
 | 
						|
		/* well, we have allocated the blocks, so it is time to free
 | 
						|
		   the reservation we made earlier. */
 | 
						|
		reiserfs_release_claimed_blocks(inode->i_sb,
 | 
						|
						blocks_to_allocate);
 | 
						|
		if (res) {
 | 
						|
			reiserfs_unprepare_pages(prepared_pages, num_pages);
 | 
						|
			break;
 | 
						|
		}
 | 
						|
 | 
						|
/* NOTE that allocating blocks and filling blocks can be done in reverse order
 | 
						|
   and probably we would do that just to get rid of garbage in files after a
 | 
						|
   crash */
 | 
						|
 | 
						|
		/* Copy data from user-supplied buffer to file's pages */
 | 
						|
		res =
 | 
						|
		    reiserfs_copy_from_user_to_file_region(pos, num_pages,
 | 
						|
							   write_bytes,
 | 
						|
							   prepared_pages, buf);
 | 
						|
		if (res) {
 | 
						|
			reiserfs_unprepare_pages(prepared_pages, num_pages);
 | 
						|
			break;
 | 
						|
		}
 | 
						|
 | 
						|
		/* Send the pages to disk and unlock them. */
 | 
						|
		res =
 | 
						|
		    reiserfs_submit_file_region_for_write(&th, inode, pos,
 | 
						|
							  num_pages,
 | 
						|
							  write_bytes,
 | 
						|
							  prepared_pages);
 | 
						|
		if (res)
 | 
						|
			break;
 | 
						|
 | 
						|
		already_written += write_bytes;
 | 
						|
		buf += write_bytes;
 | 
						|
		*ppos = pos += write_bytes;
 | 
						|
		count -= write_bytes;
 | 
						|
		balance_dirty_pages_ratelimited_nr(inode->i_mapping, num_pages);
 | 
						|
	}
 | 
						|
 | 
						|
	/* this is only true on error */
 | 
						|
	if (th.t_trans_id) {
 | 
						|
		reiserfs_write_lock(inode->i_sb);
 | 
						|
		err = journal_end(&th, th.t_super, th.t_blocks_allocated);
 | 
						|
		reiserfs_write_unlock(inode->i_sb);
 | 
						|
		if (err) {
 | 
						|
			res = err;
 | 
						|
			goto out;
 | 
						|
		}
 | 
						|
	}
 | 
						|
 | 
						|
	if (likely(res >= 0) &&
 | 
						|
	    (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))))
 | 
						|
		res = generic_osync_inode(inode, file->f_mapping,
 | 
						|
		                          OSYNC_METADATA | OSYNC_DATA);
 | 
						|
 | 
						|
	mutex_unlock(&inode->i_mutex);
 | 
						|
	reiserfs_async_progress_wait(inode->i_sb);
 | 
						|
	return (already_written != 0) ? already_written : res;
 | 
						|
 | 
						|
      out:
 | 
						|
	mutex_unlock(&inode->i_mutex);	// unlock the file on exit.
 | 
						|
	return res;
 | 
						|
}
 | 
						|
 | 
						|
/*
 * Asynchronous write entry point.  No reiserfs-specific work is done here:
 * the request is handed unchanged to generic_file_aio_write() and whatever
 * that call reports is returned to the caller.
 */
static ssize_t reiserfs_aio_write(struct kiocb *iocb, const char __user * buf,
				  size_t count, loff_t pos)
{
	ssize_t written;

	written = generic_file_aio_write(iocb, buf, count, pos);

	return written;
}
 | 
						|
 | 
						|
const struct file_operations reiserfs_file_operations = {
 | 
						|
	.read = generic_file_read,
 | 
						|
	.write = reiserfs_file_write,
 | 
						|
	.ioctl = reiserfs_ioctl,
 | 
						|
	.mmap = generic_file_mmap,
 | 
						|
	.release = reiserfs_file_release,
 | 
						|
	.fsync = reiserfs_sync_file,
 | 
						|
	.sendfile = generic_file_sendfile,
 | 
						|
	.aio_read = generic_file_aio_read,
 | 
						|
	.aio_write = reiserfs_aio_write,
 | 
						|
	.splice_read = generic_file_splice_read,
 | 
						|
	.splice_write = generic_file_splice_write,
 | 
						|
};
 | 
						|
 | 
						|
struct inode_operations reiserfs_file_inode_operations = {
 | 
						|
	.truncate = reiserfs_vfs_truncate_file,
 | 
						|
	.setattr = reiserfs_setattr,
 | 
						|
	.setxattr = reiserfs_setxattr,
 | 
						|
	.getxattr = reiserfs_getxattr,
 | 
						|
	.listxattr = reiserfs_listxattr,
 | 
						|
	.removexattr = reiserfs_removexattr,
 | 
						|
	.permission = reiserfs_permission,
 | 
						|
};
 |