| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | /*
 | 
					
						
							|  |  |  |  * proc/fs/generic.c --- generic routines for the proc-fs | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  * This file contains generic proc-fs routines for handling | 
					
						
							|  |  |  |  * directories and files. | 
					
						
							|  |  |  |  *  | 
					
						
							|  |  |  |  * Copyright (C) 1991, 1992 Linus Torvalds. | 
					
						
							|  |  |  |  * Copyright (C) 1997 Theodore Ts'o | 
					
						
							|  |  |  |  */ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #include <linux/errno.h>
 | 
					
						
							|  |  |  | #include <linux/time.h>
 | 
					
						
							|  |  |  | #include <linux/proc_fs.h>
 | 
					
						
							|  |  |  | #include <linux/stat.h>
 | 
					
						
							|  |  |  | #include <linux/module.h>
 | 
					
						
							|  |  |  | #include <linux/mount.h>
 | 
					
						
							|  |  |  | #include <linux/smp_lock.h>
 | 
					
						
							|  |  |  | #include <linux/init.h>
 | 
					
						
							|  |  |  | #include <linux/idr.h>
 | 
					
						
							|  |  |  | #include <linux/namei.h>
 | 
					
						
							|  |  |  | #include <linux/bitops.h>
 | 
					
						
							| 
									
										
										
										
											2006-03-26 01:36:55 -08:00
										 |  |  | #include <linux/spinlock.h>
 | 
					
						
							| 
									
										
											  
											
												Fix rmmod/read/write races in /proc entries
Fix following races:
===========================================
1. Write via ->write_proc sleeps in copy_from_user(). Module disappears
   meanwhile. Or, more generically, system call done on /proc file, method
   supplied by module is called, module disappears meanwhile.
   pde = create_proc_entry()
   if (!pde)
	return -ENOMEM;
   pde->write_proc = ...
				open
				write
				copy_from_user
   pde = create_proc_entry();
   if (!pde) {
	remove_proc_entry();
	return -ENOMEM;
	/* module unloaded */
   }
				*boom*
==========================================
2. bogo-revoke aka proc_kill_inodes()
  remove_proc_entry		vfs_read
  proc_kill_inodes		[check ->f_op validness]
				[check ->f_op->read validness]
				[verify_area, security permissions checks]
	->f_op = NULL;
				if (file->f_op->read)
					/* ->f_op dereference, boom */
NOTE, NOTE, NOTE: file_operations are proxied for regular files only. Let's
see how this scheme behaves, then extend if needed for directories.
Directories creators in /proc only set ->owner for them, so proxying for
directories may be unneeded.
NOTE, NOTE, NOTE: methods being proxied are ->llseek, ->read, ->write,
->poll, ->unlocked_ioctl, ->ioctl, ->compat_ioctl, ->open, ->release.
If your in-tree module uses something else, yell on me. Full audit pending.
[akpm@linux-foundation.org: build fix]
Signed-off-by: Alexey Dobriyan <adobriyan@sw.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
											
										 
											2007-07-15 23:39:00 -07:00
										 |  |  | #include <linux/completion.h>
 | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | #include <asm/uaccess.h>
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2006-01-08 01:04:16 -08:00
										 |  |  | #include "internal.h"
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2006-03-26 01:36:55 -08:00
										 |  |  | DEFINE_SPINLOCK(proc_subdir_lock); | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2007-02-14 00:34:12 -08:00
										 |  |  | static int proc_match(int len, const char *name, struct proc_dir_entry *de) | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | { | 
					
						
							|  |  |  | 	if (de->namelen != len) | 
					
						
							|  |  |  | 		return 0; | 
					
						
							|  |  |  | 	return !memcmp(name, de->name, len); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /* buffer size is one page but our output routines use some slack for overruns */ | 
					
						
							|  |  |  | #define PROC_BLOCK_SIZE	(PAGE_SIZE - 1024)
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static ssize_t | 
					
						
							|  |  |  | proc_file_read(struct file *file, char __user *buf, size_t nbytes, | 
					
						
							|  |  |  | 	       loff_t *ppos) | 
					
						
							|  |  |  | { | 
					
						
							| 
									
										
										
										
											2006-12-08 02:36:36 -08:00
										 |  |  | 	struct inode * inode = file->f_path.dentry->d_inode; | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	char 	*page; | 
					
						
							|  |  |  | 	ssize_t	retval=0; | 
					
						
							|  |  |  | 	int	eof=0; | 
					
						
							|  |  |  | 	ssize_t	n, count; | 
					
						
							|  |  |  | 	char	*start; | 
					
						
							|  |  |  | 	struct proc_dir_entry * dp; | 
					
						
							| 
									
										
										
										
											2005-12-30 08:39:10 -08:00
										 |  |  | 	unsigned long long pos; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	/*
 | 
					
						
							|  |  |  | 	 * Gaah, please just use "seq_file" instead. The legacy /proc | 
					
						
							|  |  |  | 	 * interfaces cut loff_t down to off_t for reads, and ignore | 
					
						
							|  |  |  | 	 * the offset entirely for writes.. | 
					
						
							|  |  |  | 	 */ | 
					
						
							|  |  |  | 	pos = *ppos; | 
					
						
							|  |  |  | 	if (pos > MAX_NON_LFS) | 
					
						
							|  |  |  | 		return 0; | 
					
						
							|  |  |  | 	if (nbytes > MAX_NON_LFS - pos) | 
					
						
							|  |  |  | 		nbytes = MAX_NON_LFS - pos; | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | 	dp = PDE(inode); | 
					
						
							| 
									
										
										
										
											2007-10-16 01:25:52 -07:00
										 |  |  | 	if (!(page = (char*) __get_free_page(GFP_TEMPORARY))) | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 		return -ENOMEM; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	while ((nbytes > 0) && !eof) { | 
					
						
							|  |  |  | 		count = min_t(size_t, PROC_BLOCK_SIZE, nbytes); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 		start = NULL; | 
					
						
							| 
									
										
										
										
											2008-04-29 01:01:58 -07:00
										 |  |  | 		if (dp->read_proc) { | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 			/*
 | 
					
						
							|  |  |  | 			 * How to be a proc read function | 
					
						
							|  |  |  | 			 * ------------------------------ | 
					
						
							|  |  |  | 			 * Prototype: | 
					
						
							|  |  |  | 			 *    int f(char *buffer, char **start, off_t offset, | 
					
						
							|  |  |  | 			 *          int count, int *peof, void *dat) | 
					
						
							|  |  |  | 			 * | 
					
						
							|  |  |  | 			 * Assume that the buffer is "count" bytes in size. | 
					
						
							|  |  |  | 			 * | 
					
						
							|  |  |  | 			 * If you know you have supplied all the data you | 
					
						
							|  |  |  | 			 * have, set *peof. | 
					
						
							|  |  |  | 			 * | 
					
						
							|  |  |  | 			 * You have three ways to return data: | 
					
						
							|  |  |  | 			 * 0) Leave *start = NULL.  (This is the default.) | 
					
						
							|  |  |  | 			 *    Put the data of the requested offset at that | 
					
						
							|  |  |  | 			 *    offset within the buffer.  Return the number (n) | 
					
						
							|  |  |  | 			 *    of bytes there are from the beginning of the | 
					
						
							|  |  |  | 			 *    buffer up to the last byte of data.  If the | 
					
						
							|  |  |  | 			 *    number of supplied bytes (= n - offset) is  | 
					
						
							|  |  |  | 			 *    greater than zero and you didn't signal eof | 
					
						
							|  |  |  | 			 *    and the reader is prepared to take more data | 
					
						
							|  |  |  | 			 *    you will be called again with the requested | 
					
						
							|  |  |  | 			 *    offset advanced by the number of bytes  | 
					
						
							|  |  |  | 			 *    absorbed.  This interface is useful for files | 
					
						
							|  |  |  | 			 *    no larger than the buffer. | 
					
						
							|  |  |  | 			 * 1) Set *start = an unsigned long value less than | 
					
						
							|  |  |  | 			 *    the buffer address but greater than zero. | 
					
						
							|  |  |  | 			 *    Put the data of the requested offset at the | 
					
						
							|  |  |  | 			 *    beginning of the buffer.  Return the number of | 
					
						
							|  |  |  | 			 *    bytes of data placed there.  If this number is | 
					
						
							|  |  |  | 			 *    greater than zero and you didn't signal eof | 
					
						
							|  |  |  | 			 *    and the reader is prepared to take more data | 
					
						
							|  |  |  | 			 *    you will be called again with the requested | 
					
						
							|  |  |  | 			 *    offset advanced by *start.  This interface is | 
					
						
							|  |  |  | 			 *    useful when you have a large file consisting | 
					
						
							|  |  |  | 			 *    of a series of blocks which you want to count | 
					
						
							|  |  |  | 			 *    and return as wholes. | 
					
						
							|  |  |  | 			 *    (Hack by Paul.Russell@rustcorp.com.au) | 
					
						
							|  |  |  | 			 * 2) Set *start = an address within the buffer. | 
					
						
							|  |  |  | 			 *    Put the data of the requested offset at *start. | 
					
						
							|  |  |  | 			 *    Return the number of bytes of data placed there. | 
					
						
							|  |  |  | 			 *    If this number is greater than zero and you | 
					
						
							|  |  |  | 			 *    didn't signal eof and the reader is prepared to | 
					
						
							|  |  |  | 			 *    take more data you will be called again with the | 
					
						
							|  |  |  | 			 *    requested offset advanced by the number of bytes | 
					
						
							|  |  |  | 			 *    absorbed. | 
					
						
							|  |  |  | 			 */ | 
					
						
							|  |  |  | 			n = dp->read_proc(page, &start, *ppos, | 
					
						
							|  |  |  | 					  count, &eof, dp->data); | 
					
						
							|  |  |  | 		} else | 
					
						
							|  |  |  | 			break; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 		if (n == 0)   /* end of file */ | 
					
						
							|  |  |  | 			break; | 
					
						
							|  |  |  | 		if (n < 0) {  /* error */ | 
					
						
							|  |  |  | 			if (retval == 0) | 
					
						
							|  |  |  | 				retval = n; | 
					
						
							|  |  |  | 			break; | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 		if (start == NULL) { | 
					
						
							|  |  |  | 			if (n > PAGE_SIZE) { | 
					
						
							|  |  |  | 				printk(KERN_ERR | 
					
						
							|  |  |  | 				       "proc_file_read: Apparent buffer overflow!\n"); | 
					
						
							|  |  |  | 				n = PAGE_SIZE; | 
					
						
							|  |  |  | 			} | 
					
						
							|  |  |  | 			n -= *ppos; | 
					
						
							|  |  |  | 			if (n <= 0) | 
					
						
							|  |  |  | 				break; | 
					
						
							|  |  |  | 			if (n > count) | 
					
						
							|  |  |  | 				n = count; | 
					
						
							|  |  |  | 			start = page + *ppos; | 
					
						
							|  |  |  | 		} else if (start < page) { | 
					
						
							|  |  |  | 			if (n > PAGE_SIZE) { | 
					
						
							|  |  |  | 				printk(KERN_ERR | 
					
						
							|  |  |  | 				       "proc_file_read: Apparent buffer overflow!\n"); | 
					
						
							|  |  |  | 				n = PAGE_SIZE; | 
					
						
							|  |  |  | 			} | 
					
						
							|  |  |  | 			if (n > count) { | 
					
						
							|  |  |  | 				/*
 | 
					
						
							|  |  |  | 				 * Don't reduce n because doing so might | 
					
						
							|  |  |  | 				 * cut off part of a data block. | 
					
						
							|  |  |  | 				 */ | 
					
						
							|  |  |  | 				printk(KERN_WARNING | 
					
						
							|  |  |  | 				       "proc_file_read: Read count exceeded\n"); | 
					
						
							|  |  |  | 			} | 
					
						
							|  |  |  | 		} else /* start >= page */ { | 
					
						
							|  |  |  | 			unsigned long startoff = (unsigned long)(start - page); | 
					
						
							|  |  |  | 			if (n > (PAGE_SIZE - startoff)) { | 
					
						
							|  |  |  | 				printk(KERN_ERR | 
					
						
							|  |  |  | 				       "proc_file_read: Apparent buffer overflow!\n"); | 
					
						
							|  |  |  | 				n = PAGE_SIZE - startoff; | 
					
						
							|  |  |  | 			} | 
					
						
							|  |  |  | 			if (n > count) | 
					
						
							|  |  |  | 				n = count; | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 		 | 
					
						
							|  |  |  |  		n -= copy_to_user(buf, start < page ? page : start, n); | 
					
						
							|  |  |  | 		if (n == 0) { | 
					
						
							|  |  |  | 			if (retval == 0) | 
					
						
							|  |  |  | 				retval = -EFAULT; | 
					
						
							|  |  |  | 			break; | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 		*ppos += start < page ? (unsigned long)start : n; | 
					
						
							|  |  |  | 		nbytes -= n; | 
					
						
							|  |  |  | 		buf += n; | 
					
						
							|  |  |  | 		retval += n; | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	free_page((unsigned long) page); | 
					
						
							|  |  |  | 	return retval; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static ssize_t | 
					
						
							|  |  |  | proc_file_write(struct file *file, const char __user *buffer, | 
					
						
							|  |  |  | 		size_t count, loff_t *ppos) | 
					
						
							|  |  |  | { | 
					
						
							| 
									
										
										
										
											2006-12-08 02:36:36 -08:00
										 |  |  | 	struct inode *inode = file->f_path.dentry->d_inode; | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	struct proc_dir_entry * dp; | 
					
						
							|  |  |  | 	 | 
					
						
							|  |  |  | 	dp = PDE(inode); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	if (!dp->write_proc) | 
					
						
							|  |  |  | 		return -EIO; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	/* FIXME: does this routine need ppos?  probably... */ | 
					
						
							|  |  |  | 	return dp->write_proc(file, buffer, count, dp->data); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static loff_t | 
					
						
							|  |  |  | proc_file_lseek(struct file *file, loff_t offset, int orig) | 
					
						
							|  |  |  | { | 
					
						
							| 
									
										
										
										
											2005-12-30 08:39:10 -08:00
										 |  |  | 	loff_t retval = -EINVAL; | 
					
						
							|  |  |  | 	switch (orig) { | 
					
						
							|  |  |  | 	case 1: | 
					
						
							|  |  |  | 		offset += file->f_pos; | 
					
						
							|  |  |  | 	/* fallthrough */ | 
					
						
							|  |  |  | 	case 0: | 
					
						
							|  |  |  | 		if (offset < 0 || offset > MAX_NON_LFS) | 
					
						
							|  |  |  | 			break; | 
					
						
							|  |  |  | 		file->f_pos = retval = offset; | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	return retval; | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-02-08 04:18:27 -08:00
										 |  |  | static const struct file_operations proc_file_operations = { | 
					
						
							|  |  |  | 	.llseek		= proc_file_lseek, | 
					
						
							|  |  |  | 	.read		= proc_file_read, | 
					
						
							|  |  |  | 	.write		= proc_file_write, | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	struct inode *inode = dentry->d_inode; | 
					
						
							|  |  |  | 	struct proc_dir_entry *de = PDE(inode); | 
					
						
							|  |  |  | 	int error; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	error = inode_change_ok(inode, iattr); | 
					
						
							|  |  |  | 	if (error) | 
					
						
							|  |  |  | 		goto out; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	error = inode_setattr(inode, iattr); | 
					
						
							|  |  |  | 	if (error) | 
					
						
							|  |  |  | 		goto out; | 
					
						
							|  |  |  | 	 | 
					
						
							|  |  |  | 	de->uid = inode->i_uid; | 
					
						
							|  |  |  | 	de->gid = inode->i_gid; | 
					
						
							|  |  |  | 	de->mode = inode->i_mode; | 
					
						
							|  |  |  | out: | 
					
						
							|  |  |  | 	return error; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2005-09-06 15:17:18 -07:00
										 |  |  | static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry, | 
					
						
							|  |  |  | 			struct kstat *stat) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	struct inode *inode = dentry->d_inode; | 
					
						
							|  |  |  | 	struct proc_dir_entry *de = PROC_I(inode)->pde; | 
					
						
							|  |  |  | 	if (de && de->nlink) | 
					
						
							|  |  |  | 		inode->i_nlink = de->nlink; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	generic_fillattr(inode, stat); | 
					
						
							|  |  |  | 	return 0; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2007-02-12 00:55:40 -08:00
										 |  |  | static const struct inode_operations proc_file_inode_operations = { | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	.setattr	= proc_notify_change, | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /*
 | 
					
						
							|  |  |  |  * This function parses a name such as "tty/driver/serial", and | 
					
						
							|  |  |  |  * returns the struct proc_dir_entry for "/proc/tty/driver", and | 
					
						
							|  |  |  |  * returns "serial" in residual. | 
					
						
							|  |  |  |  */ | 
					
						
							|  |  |  | static int xlate_proc_name(const char *name, | 
					
						
							|  |  |  | 			   struct proc_dir_entry **ret, const char **residual) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	const char     		*cp = name, *next; | 
					
						
							|  |  |  | 	struct proc_dir_entry	*de; | 
					
						
							|  |  |  | 	int			len; | 
					
						
							| 
									
										
										
										
											2006-03-26 01:36:55 -08:00
										 |  |  | 	int 			rtn = 0; | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
											
												proc: less special case in xlate code
If valid "parent" is passed to proc_create/remove_proc_entry(), then name of
PDE should consist of only one path component, otherwise creation or or
removal will fail.  However, if NULL is passed as parent then create/remove
accept full path as a argument.  This is arbitrary restriction -- all
infrastructure is in place.
So, patch allows the following to succeed:
	create_proc_entry("foo/bar", 0, pde_baz);
	remove_proc_entry("baz/foo/bar", &proc_root);
Also makes the following to behave identically:
	create_proc_entry("foo/bar", 0, NULL);
	create_proc_entry("foo/bar", 0, &proc_root);
Discrepancy noticed by Den Lunev (IIRC).
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
											
										 
											2008-04-29 01:01:40 -07:00
										 |  |  | 	de = *ret; | 
					
						
							|  |  |  | 	if (!de) | 
					
						
							|  |  |  | 		de = &proc_root; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2006-03-26 01:36:55 -08:00
										 |  |  | 	spin_lock(&proc_subdir_lock); | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	while (1) { | 
					
						
							|  |  |  | 		next = strchr(cp, '/'); | 
					
						
							|  |  |  | 		if (!next) | 
					
						
							|  |  |  | 			break; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 		len = next - cp; | 
					
						
							|  |  |  | 		for (de = de->subdir; de ; de = de->next) { | 
					
						
							|  |  |  | 			if (proc_match(len, cp, de)) | 
					
						
							|  |  |  | 				break; | 
					
						
							|  |  |  | 		} | 
					
						
							| 
									
										
										
										
											2006-03-26 01:36:55 -08:00
										 |  |  | 		if (!de) { | 
					
						
							|  |  |  | 			rtn = -ENOENT; | 
					
						
							|  |  |  | 			goto out; | 
					
						
							|  |  |  | 		} | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 		cp += len + 1; | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	*residual = cp; | 
					
						
							|  |  |  | 	*ret = de; | 
					
						
							| 
									
										
										
										
											2006-03-26 01:36:55 -08:00
										 |  |  | out: | 
					
						
							|  |  |  | 	spin_unlock(&proc_subdir_lock); | 
					
						
							|  |  |  | 	return rtn; | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-07-26 11:21:37 +04:00
										 |  |  | static DEFINE_IDA(proc_inum_ida); | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */ | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-07-26 11:18:28 +04:00
										 |  |  | #define PROC_DYNAMIC_FIRST 0xF0000000U
 | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | /*
 | 
					
						
							|  |  |  |  * Return an inode number between PROC_DYNAMIC_FIRST and | 
					
						
							|  |  |  |  * 0xffffffff, or zero on failure. | 
					
						
							|  |  |  |  */ | 
					
						
							|  |  |  | static unsigned int get_inode_number(void) | 
					
						
							|  |  |  | { | 
					
						
							| 
									
										
										
										
											2008-07-26 11:18:28 +04:00
										 |  |  | 	unsigned int i; | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	int error; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | retry: | 
					
						
							| 
									
										
										
										
											2008-07-26 11:21:37 +04:00
										 |  |  | 	if (ida_pre_get(&proc_inum_ida, GFP_KERNEL) == 0) | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 		return 0; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	spin_lock(&proc_inum_lock); | 
					
						
							| 
									
										
										
										
											2008-07-26 11:21:37 +04:00
										 |  |  | 	error = ida_get_new(&proc_inum_ida, &i); | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	spin_unlock(&proc_inum_lock); | 
					
						
							|  |  |  | 	if (error == -EAGAIN) | 
					
						
							|  |  |  | 		goto retry; | 
					
						
							|  |  |  | 	else if (error) | 
					
						
							|  |  |  | 		return 0; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-07-26 11:18:28 +04:00
										 |  |  | 	if (i > UINT_MAX - PROC_DYNAMIC_FIRST) { | 
					
						
							|  |  |  | 		spin_lock(&proc_inum_lock); | 
					
						
							| 
									
										
										
										
											2008-07-26 11:21:37 +04:00
										 |  |  | 		ida_remove(&proc_inum_ida, i); | 
					
						
							| 
									
										
										
										
											2008-07-26 11:18:28 +04:00
										 |  |  | 		spin_unlock(&proc_inum_lock); | 
					
						
							| 
									
										
										
										
											2008-08-02 07:30:48 +04:00
										 |  |  | 		return 0; | 
					
						
							| 
									
										
										
										
											2008-07-26 11:18:28 +04:00
										 |  |  | 	} | 
					
						
							|  |  |  | 	return PROC_DYNAMIC_FIRST + i; | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static void release_inode_number(unsigned int inum) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	spin_lock(&proc_inum_lock); | 
					
						
							| 
									
										
										
										
											2008-07-26 11:21:37 +04:00
										 |  |  | 	ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	spin_unlock(&proc_inum_lock); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
											
												[PATCH] Fix up symlink function pointers
This fixes up the symlink functions for the calling convention change:
 * afs, autofs4, befs, devfs, freevxfs, jffs2, jfs, ncpfs, procfs,
   smbfs, sysvfs, ufs, xfs - prototype change for ->follow_link()
 * befs, smbfs, xfs - same for ->put_link()
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
											
										 
											2005-08-20 00:17:39 +01:00
										 |  |  | static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd) | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | { | 
					
						
							|  |  |  | 	nd_set_link(nd, PDE(dentry->d_inode)->data); | 
					
						
							| 
									
										
										
											
												[PATCH] Fix up symlink function pointers
This fixes up the symlink functions for the calling convention change:
 * afs, autofs4, befs, devfs, freevxfs, jffs2, jfs, ncpfs, procfs,
   smbfs, sysvfs, ufs, xfs - prototype change for ->follow_link()
 * befs, smbfs, xfs - same for ->put_link()
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
											
										 
											2005-08-20 00:17:39 +01:00
										 |  |  | 	return NULL; | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2007-02-12 00:55:40 -08:00
										 |  |  | static const struct inode_operations proc_link_inode_operations = { | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	.readlink	= generic_readlink, | 
					
						
							|  |  |  | 	.follow_link	= proc_follow_link, | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							/*
							 * d_delete hook that unconditionally returns 1.
							 *
							 * As some entries in /proc are volatile, we want to get rid of
							 * unused dentries.  This could be made smarter: we could keep a
							 * "volatile" flag in the inode to indicate which ones to keep.
							 */
							static int proc_delete_dentry(struct dentry * dentry)
							{
								return 1;
							}
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static struct dentry_operations proc_dentry_operations = | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	.d_delete	= proc_delete_dentry, | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /*
 | 
					
						
							|  |  |  |  * Don't create negative dentries here, return -ENOENT by hand | 
					
						
							|  |  |  |  * instead. | 
					
						
							|  |  |  |  */ | 
					
						
							| 
									
										
											  
											
												[NET]: Make /proc/net a symlink on /proc/self/net (v3)
Current /proc/net is done with so called "shadows", but current
implementation is broken and has little chances to get fixed.
The problem is that dentries subtree of /proc/net directory has
fancy revalidation rules to make processes living in different
net namespaces see different entries in /proc/net subtree, but
currently, tasks see in the /proc/net subdir the contents of any
other namespace, depending on who opened the file first.
The proposed fix is to turn /proc/net into a symlink, which points
to /proc/self/net, which in turn shows what previously was in
/proc/net - the network-related info, from the net namespace the
appropriate task lives in.
# ls -l /proc/net
lrwxrwxrwx  1 root root 8 Mar  5 15:17 /proc/net -> self/net
In other words - this behaves like /proc/mounts, but unlike
"mounts", "net" is not a file, but a directory.
Changes from v2:
* Fixed discrepancy of /proc/net nlink count and selinux labeling
  screwup pointed out by Stephen.
  To get the correct nlink count the ->getattr callback for /proc/net
  is overridden to read one from the net->proc_net entry.
  To make selinux still work the net->proc_net entry is initialized
  properly, i.e. with the "net" name and the proc_net parent.
Selinux fixes are
Acked-by:  Stephen Smalley <sds@tycho.nsa.gov>
Changes from v1:
* Fixed a task_struct leak in get_proc_task_net, pointed out by Paul.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
											
										 
											2008-03-07 11:08:40 -08:00
										 |  |  | struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, | 
					
						
							|  |  |  | 		struct dentry *dentry) | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | { | 
					
						
							|  |  |  | 	struct inode *inode = NULL; | 
					
						
							|  |  |  | 	int error = -ENOENT; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	lock_kernel(); | 
					
						
							| 
									
										
										
										
											2006-03-26 01:36:55 -08:00
										 |  |  | 	spin_lock(&proc_subdir_lock); | 
					
						
							| 
									
										
										
										
											2008-04-29 01:01:41 -07:00
										 |  |  | 	for (de = de->subdir; de ; de = de->next) { | 
					
						
							|  |  |  | 		if (de->namelen != dentry->d_name.len) | 
					
						
							|  |  |  | 			continue; | 
					
						
							|  |  |  | 		if (!memcmp(dentry->d_name.name, de->name, de->namelen)) { | 
					
						
							|  |  |  | 			unsigned int ino; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 			ino = de->low_ino; | 
					
						
							|  |  |  | 			de_get(de); | 
					
						
							|  |  |  | 			spin_unlock(&proc_subdir_lock); | 
					
						
							|  |  |  | 			error = -EINVAL; | 
					
						
							|  |  |  | 			inode = proc_get_inode(dir->i_sb, ino, de); | 
					
						
							|  |  |  | 			goto out_unlock; | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 		} | 
					
						
							|  |  |  | 	} | 
					
						
							| 
									
										
										
										
											2006-03-26 01:36:55 -08:00
										 |  |  | 	spin_unlock(&proc_subdir_lock); | 
					
						
							| 
									
										
										
										
											2008-02-08 04:18:27 -08:00
										 |  |  | out_unlock: | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	unlock_kernel(); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	if (inode) { | 
					
						
							|  |  |  | 		dentry->d_op = &proc_dentry_operations; | 
					
						
							|  |  |  | 		d_add(dentry, inode); | 
					
						
							|  |  |  | 		return NULL; | 
					
						
							|  |  |  | 	} | 
					
						
							| 
									
										
										
										
											2008-04-29 01:01:41 -07:00
										 |  |  | 	if (de) | 
					
						
							|  |  |  | 		de_put(de); | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	return ERR_PTR(error); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												[NET]: Make /proc/net a symlink on /proc/self/net (v3)
Current /proc/net is done with so called "shadows", but current
implementation is broken and has little chances to get fixed.
The problem is that dentries subtree of /proc/net directory has
fancy revalidation rules to make processes living in different
net namespaces see different entries in /proc/net subtree, but
currently, tasks see in the /proc/net subdir the contents of any
other namespace, depending on who opened the file first.
The proposed fix is to turn /proc/net into a symlink, which points
to /proc/self/net, which in turn shows what previously was in
/proc/net - the network-related info, from the net namespace the
appropriate task lives in.
# ls -l /proc/net
lrwxrwxrwx  1 root root 8 Mar  5 15:17 /proc/net -> self/net
In other words - this behaves like /proc/mounts, but unlike
"mounts", "net" is not a file, but a directory.
Changes from v2:
* Fixed discrepancy of /proc/net nlink count and selinux labeling
  screwup pointed out by Stephen.
  To get the correct nlink count the ->getattr callback for /proc/net
  is overridden to read one from the net->proc_net entry.
  To make selinux still work the net->proc_net entry is initialized
  properly, i.e. with the "net" name and the proc_net parent.
Selinux fixes are
Acked-by:  Stephen Smalley <sds@tycho.nsa.gov>
Changes from v1:
* Fixed a task_struct leak in get_proc_task_net, pointed out by Paul.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
											
										 
											2008-03-07 11:08:40 -08:00
										 |  |  | struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry, | 
					
						
							|  |  |  | 		struct nameidata *nd) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	return proc_lookup_de(PDE(dir), dir, dentry); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | /*
 | 
					
						
							|  |  |  |  * This returns non-zero if at EOF, so that the /proc | 
					
						
							|  |  |  |  * root directory can use this and check if it should | 
					
						
							|  |  |  |  * continue with the <pid> entries.. | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  * Note that the VFS-layer doesn't care about the return | 
					
						
							|  |  |  |  * value of the readdir() call, as long as it's non-negative | 
					
						
							|  |  |  |  * for success.. | 
					
						
							|  |  |  |  */ | 
					
						
							| 
									
										
											  
											
												[NET]: Make /proc/net a symlink on /proc/self/net (v3)
Current /proc/net is done with so called "shadows", but current
implementation is broken and has little chances to get fixed.
The problem is that dentries subtree of /proc/net directory has
fancy revalidation rules to make processes living in different
net namespaces see different entries in /proc/net subtree, but
currently, tasks see in the /proc/net subdir the contents of any
other namespace, depending on who opened the file first.
The proposed fix is to turn /proc/net into a symlink, which points
to /proc/self/net, which in turn shows what previously was in
/proc/net - the network-related info, from the net namespace the
appropriate task lives in.
# ls -l /proc/net
lrwxrwxrwx  1 root root 8 Mar  5 15:17 /proc/net -> self/net
In other words - this behaves like /proc/mounts, but unlike
"mounts", "net" is not a file, but a directory.
Changes from v2:
* Fixed discrepancy of /proc/net nlink count and selinux labeling
  screwup pointed out by Stephen.
  To get the correct nlink count the ->getattr callback for /proc/net
  is overridden to read one from the net->proc_net entry.
  To make selinux still work the net->proc_net entry is initialized
  properly, i.e. with the "net" name and the proc_net parent.
Selinux fixes are
Acked-by:  Stephen Smalley <sds@tycho.nsa.gov>
Changes from v1:
* Fixed a task_struct leak in get_proc_task_net, pointed out by Paul.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
											
										 
											2008-03-07 11:08:40 -08:00
										 |  |  | int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, | 
					
						
							|  |  |  | 		filldir_t filldir) | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | { | 
					
						
							|  |  |  | 	unsigned int ino; | 
					
						
							|  |  |  | 	int i; | 
					
						
							| 
									
										
										
										
											2006-12-08 02:36:36 -08:00
										 |  |  | 	struct inode *inode = filp->f_path.dentry->d_inode; | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	int ret = 0; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	lock_kernel(); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	ino = inode->i_ino; | 
					
						
							|  |  |  | 	i = filp->f_pos; | 
					
						
							|  |  |  | 	switch (i) { | 
					
						
							|  |  |  | 		case 0: | 
					
						
							|  |  |  | 			if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) | 
					
						
							|  |  |  | 				goto out; | 
					
						
							|  |  |  | 			i++; | 
					
						
							|  |  |  | 			filp->f_pos++; | 
					
						
							|  |  |  | 			/* fall through */ | 
					
						
							|  |  |  | 		case 1: | 
					
						
							|  |  |  | 			if (filldir(dirent, "..", 2, i, | 
					
						
							| 
									
										
										
										
											2006-12-08 02:36:36 -08:00
										 |  |  | 				    parent_ino(filp->f_path.dentry), | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 				    DT_DIR) < 0) | 
					
						
							|  |  |  | 				goto out; | 
					
						
							|  |  |  | 			i++; | 
					
						
							|  |  |  | 			filp->f_pos++; | 
					
						
							|  |  |  | 			/* fall through */ | 
					
						
							|  |  |  | 		default: | 
					
						
							| 
									
										
										
										
											2006-03-26 01:36:55 -08:00
										 |  |  | 			spin_lock(&proc_subdir_lock); | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 			de = de->subdir; | 
					
						
							|  |  |  | 			i -= 2; | 
					
						
							|  |  |  | 			for (;;) { | 
					
						
							|  |  |  | 				if (!de) { | 
					
						
							|  |  |  | 					ret = 1; | 
					
						
							| 
									
										
										
										
											2006-03-26 01:36:55 -08:00
										 |  |  | 					spin_unlock(&proc_subdir_lock); | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 					goto out; | 
					
						
							|  |  |  | 				} | 
					
						
							|  |  |  | 				if (!i) | 
					
						
							|  |  |  | 					break; | 
					
						
							|  |  |  | 				de = de->next; | 
					
						
							|  |  |  | 				i--; | 
					
						
							|  |  |  | 			} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 			do { | 
					
						
							| 
									
										
										
										
											2007-05-08 00:25:47 -07:00
										 |  |  | 				struct proc_dir_entry *next; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2006-03-26 01:36:55 -08:00
										 |  |  | 				/* filldir passes info to user space */ | 
					
						
							| 
									
										
										
										
											2007-05-08 00:25:47 -07:00
										 |  |  | 				de_get(de); | 
					
						
							| 
									
										
										
										
											2006-03-26 01:36:55 -08:00
										 |  |  | 				spin_unlock(&proc_subdir_lock); | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 				if (filldir(dirent, de->name, de->namelen, filp->f_pos, | 
					
						
							| 
									
										
										
										
											2007-05-08 00:25:47 -07:00
										 |  |  | 					    de->low_ino, de->mode >> 12) < 0) { | 
					
						
							|  |  |  | 					de_put(de); | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 					goto out; | 
					
						
							| 
									
										
										
										
											2007-05-08 00:25:47 -07:00
										 |  |  | 				} | 
					
						
							| 
									
										
										
										
											2006-03-26 01:36:55 -08:00
										 |  |  | 				spin_lock(&proc_subdir_lock); | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 				filp->f_pos++; | 
					
						
							| 
									
										
										
										
											2007-05-08 00:25:47 -07:00
										 |  |  | 				next = de->next; | 
					
						
							|  |  |  | 				de_put(de); | 
					
						
							|  |  |  | 				de = next; | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 			} while (de); | 
					
						
							| 
									
										
										
										
											2006-03-26 01:36:55 -08:00
										 |  |  | 			spin_unlock(&proc_subdir_lock); | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	} | 
					
						
							|  |  |  | 	ret = 1; | 
					
						
							|  |  |  | out:	unlock_kernel(); | 
					
						
							|  |  |  | 	return ret;	 | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												[NET]: Make /proc/net a symlink on /proc/self/net (v3)
Current /proc/net is done with so called "shadows", but current
implementation is broken and has little chances to get fixed.
The problem is that dentries subtree of /proc/net directory has
fancy revalidation rules to make processes living in different
net namespaces see different entries in /proc/net subtree, but
currently, tasks see in the /proc/net subdir the contents of any
other namespace, depending on who opened the file first.
The proposed fix is to turn /proc/net into a symlink, which points
to /proc/self/net, which in turn shows what previously was in
/proc/net - the network-related info, from the net namespace the
appropriate task lives in.
# ls -l /proc/net
lrwxrwxrwx  1 root root 8 Mar  5 15:17 /proc/net -> self/net
In other words - this behaves like /proc/mounts, but unlike
"mounts", "net" is not a file, but a directory.
Changes from v2:
* Fixed discrepancy of /proc/net nlink count and selinux labeling
  screwup pointed out by Stephen.
  To get the correct nlink count the ->getattr callback for /proc/net
  is overridden to read one from the net->proc_net entry.
  To make selinux still work the net->proc_net entry is initialized
  properly, i.e. with the "net" name and the proc_net parent.
Selinux fixes are
Acked-by:  Stephen Smalley <sds@tycho.nsa.gov>
Changes from v1:
* Fixed a task_struct leak in get_proc_task_net, pointed out by Paul.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
											
										 
											2008-03-07 11:08:40 -08:00
										 |  |  | int proc_readdir(struct file *filp, void *dirent, filldir_t filldir) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	struct inode *inode = filp->f_path.dentry->d_inode; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	return proc_readdir_de(PDE(inode), filp, dirent, filldir); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | /*
 | 
					
						
							|  |  |  |  * These are the generic /proc directory operations. They | 
					
						
							|  |  |  |  * use the in-memory "struct proc_dir_entry" tree to parse | 
					
						
							|  |  |  |  * the /proc directory. | 
					
						
							|  |  |  |  */ | 
					
						
							| 
									
										
										
										
											2007-02-12 00:55:34 -08:00
										 |  |  | static const struct file_operations proc_dir_operations = { | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	.read			= generic_read_dir, | 
					
						
							|  |  |  | 	.readdir		= proc_readdir, | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /*
 | 
					
						
							|  |  |  |  * proc directories can do almost nothing.. | 
					
						
							|  |  |  |  */ | 
					
						
							| 
									
										
										
										
											2007-02-12 00:55:40 -08:00
										 |  |  | static const struct inode_operations proc_dir_inode_operations = { | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	.lookup		= proc_lookup, | 
					
						
							| 
									
										
										
										
											2005-09-06 15:17:18 -07:00
										 |  |  | 	.getattr	= proc_getattr, | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	.setattr	= proc_notify_change, | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	unsigned int i; | 
					
						
							| 
									
										
										
										
											2008-02-08 04:18:29 -08:00
										 |  |  | 	struct proc_dir_entry *tmp; | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	 | 
					
						
							|  |  |  | 	i = get_inode_number(); | 
					
						
							|  |  |  | 	if (i == 0) | 
					
						
							|  |  |  | 		return -EAGAIN; | 
					
						
							|  |  |  | 	dp->low_ino = i; | 
					
						
							| 
									
										
										
										
											2006-03-26 01:36:55 -08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	if (S_ISDIR(dp->mode)) { | 
					
						
							|  |  |  | 		if (dp->proc_iops == NULL) { | 
					
						
							|  |  |  | 			dp->proc_fops = &proc_dir_operations; | 
					
						
							|  |  |  | 			dp->proc_iops = &proc_dir_inode_operations; | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 		dir->nlink++; | 
					
						
							|  |  |  | 	} else if (S_ISLNK(dp->mode)) { | 
					
						
							|  |  |  | 		if (dp->proc_iops == NULL) | 
					
						
							|  |  |  | 			dp->proc_iops = &proc_link_inode_operations; | 
					
						
							|  |  |  | 	} else if (S_ISREG(dp->mode)) { | 
					
						
							|  |  |  | 		if (dp->proc_fops == NULL) | 
					
						
							|  |  |  | 			dp->proc_fops = &proc_file_operations; | 
					
						
							|  |  |  | 		if (dp->proc_iops == NULL) | 
					
						
							|  |  |  | 			dp->proc_iops = &proc_file_inode_operations; | 
					
						
							|  |  |  | 	} | 
					
						
							| 
									
										
										
										
											2007-07-15 23:40:09 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | 	spin_lock(&proc_subdir_lock); | 
					
						
							| 
									
										
										
										
											2008-02-08 04:18:29 -08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 	for (tmp = dir->subdir; tmp; tmp = tmp->next) | 
					
						
							|  |  |  | 		if (strcmp(tmp->name, dp->name) == 0) { | 
					
						
							| 
									
										
										
										
											2008-09-13 19:51:30 -07:00
										 |  |  | 			WARN(1, KERN_WARNING "proc_dir_entry '%s/%s' already registered\n", | 
					
						
							| 
									
										
										
										
											2008-09-13 02:33:06 -07:00
										 |  |  | 				dir->name, dp->name); | 
					
						
							| 
									
										
										
										
											2008-02-08 04:18:29 -08:00
										 |  |  | 			break; | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2007-07-15 23:40:09 -07:00
										 |  |  | 	dp->next = dir->subdir; | 
					
						
							|  |  |  | 	dp->parent = dir; | 
					
						
							|  |  |  | 	dir->subdir = dp; | 
					
						
							|  |  |  | 	spin_unlock(&proc_subdir_lock); | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	return 0; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												proc: fix ->open'less usage due to ->proc_fops flip
Typical PDE creation code looks like:
	pde = create_proc_entry("foo", 0, NULL);
	if (pde)
		pde->proc_fops = &foo_proc_fops;
Notice that PDE is first created, only then ->proc_fops is set up to
final value. This is a problem because right after creation
a) PDE is fully visible in /proc , and
b) ->proc_fops are proc_file_operations which do not have ->open callback. So, it's
   possible to ->read without ->open (see one class of oopses below).
The fix is new API called proc_create() which makes sure ->proc_fops are
set up before gluing PDE to main tree. Typical new code looks like:
	pde = proc_create("foo", 0, NULL, &foo_proc_fops);
	if (!pde)
		return -ENOMEM;
Fix most networking users for a start.
In the long run, create_proc_entry() for regular files will go.
BUG: unable to handle kernel NULL pointer dereference at virtual address 00000024
printing eip: c1188c1b *pdpt = 000000002929e001 *pde = 0000000000000000
Oops: 0002 [#1] PREEMPT SMP DEBUG_PAGEALLOC
last sysfs file: /sys/block/sda/sda1/dev
Modules linked in: foo af_packet ipv6 cpufreq_ondemand loop serio_raw psmouse k8temp hwmon sr_mod cdrom
Pid: 24679, comm: cat Not tainted (2.6.24-rc3-mm1 #2)
EIP: 0060:[<c1188c1b>] EFLAGS: 00210002 CPU: 0
EIP is at mutex_lock_nested+0x75/0x25d
EAX: 000006fe EBX: fffffffb ECX: 00001000 EDX: e9340570
ESI: 00000020 EDI: 00200246 EBP: e9340570 ESP: e8ea1ef8
 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
Process cat (pid: 24679, ti=E8EA1000 task=E9340570 task.ti=E8EA1000)
Stack: 00000000 c106f7ce e8ee05b4 00000000 00000001 458003d0 f6fb6f20 fffffffb
       00000000 c106f7aa 00001000 c106f7ce 08ae9000 f6db53f0 00000020 00200246
       00000000 00000002 00000000 00200246 00200246 e8ee05a0 fffffffb e8ee0550
Call Trace:
 [<c106f7ce>] seq_read+0x24/0x28a
 [<c106f7aa>] seq_read+0x0/0x28a
 [<c106f7ce>] seq_read+0x24/0x28a
 [<c106f7aa>] seq_read+0x0/0x28a
 [<c10818b8>] proc_reg_read+0x60/0x73
 [<c1081858>] proc_reg_read+0x0/0x73
 [<c105a34f>] vfs_read+0x6c/0x8b
 [<c105a6f3>] sys_read+0x3c/0x63
 [<c10025f2>] sysenter_past_esp+0x5f/0xa5
 [<c10697a7>] destroy_inode+0x24/0x33
 =======================
INFO: lockdep is turned off.
Code: 75 21 68 e1 1a 19 c1 68 87 00 00 00 68 b8 e8 1f c1 68 25 73 1f c1 e8 84 06 e9 ff e8 52 b8 e7 ff 83 c4 10 9c 5f fa e8 28 89 ea ff <f0> fe 4e 04 79 0a f3 90 80 7e 04 00 7e f8 eb f0 39 76 34 74 33
EIP: [<c1188c1b>] mutex_lock_nested+0x75/0x25d SS:ESP 0068:e8ea1ef8
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Alexey Dobriyan <adobriyan@sw.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
											
										 
											2008-02-08 04:18:37 -08:00
										 |  |  | static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 					  const char *name, | 
					
						
							|  |  |  | 					  mode_t mode, | 
					
						
							|  |  |  | 					  nlink_t nlink) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	struct proc_dir_entry *ent = NULL; | 
					
						
							|  |  |  | 	const char *fn = name; | 
					
						
							|  |  |  | 	int len; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	/* make sure name is valid */ | 
					
						
							|  |  |  | 	if (!name || !strlen(name)) goto out; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
											
												proc: less special case in xlate code
If valid "parent" is passed to proc_create/remove_proc_entry(), then name of
PDE should consist of only one path component, otherwise creation or
removal will fail.  However, if NULL is passed as parent then create/remove
accept full path as an argument.  This is an arbitrary restriction -- all
infrastructure is in place.
So, patch allows the following to succeed:
	create_proc_entry("foo/bar", 0, pde_baz);
	remove_proc_entry("baz/foo/bar", &proc_root);
Also makes the following to behave identically:
	create_proc_entry("foo/bar", 0, NULL);
	create_proc_entry("foo/bar", 0, &proc_root);
Discrepancy noticed by Den Lunev (IIRC).
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
											
										 
											2008-04-29 01:01:40 -07:00
										 |  |  | 	if (xlate_proc_name(name, parent, &fn) != 0) | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 		goto out; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	/* At this point there must not be any '/' characters beyond *fn */ | 
					
						
							|  |  |  | 	if (strchr(fn, '/')) | 
					
						
							|  |  |  | 		goto out; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	len = strlen(fn); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	ent = kmalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL); | 
					
						
							|  |  |  | 	if (!ent) goto out; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	memset(ent, 0, sizeof(struct proc_dir_entry)); | 
					
						
							|  |  |  | 	memcpy(((char *) ent) + sizeof(struct proc_dir_entry), fn, len + 1); | 
					
						
							|  |  |  | 	ent->name = ((char *) ent) + sizeof(*ent); | 
					
						
							|  |  |  | 	ent->namelen = len; | 
					
						
							|  |  |  | 	ent->mode = mode; | 
					
						
							|  |  |  | 	ent->nlink = nlink; | 
					
						
							| 
									
										
											  
											
												proc: fix proc_dir_entry refcounting
Creating PDEs with refcount 0 and "deleted" flag has problems (see below).
Switch to usual scheme:
* PDE is created with refcount 1
* every de_get does +1
* every de_put() and remove_proc_entry() do -1
* once refcount reaches 0, PDE is freed.
This elegantly fixes at least two following races (both observed) without
introducing new locks, without abusing old locks, without spreading
lock_kernel():
1) PDE leak
remove_proc_entry			de_put
-----------------			------
			[refcnt = 1]
if (atomic_read(&de->count) == 0)
					if (atomic_dec_and_test(&de->count))
						if (de->deleted)
							/* also not taken! */
							free_proc_entry(de);
else
	de->deleted = 1;
		[refcount=0, deleted=1]
2) use after free
remove_proc_entry			de_put
-----------------			------
			[refcnt = 1]
					if (atomic_dec_and_test(&de->count))
if (atomic_read(&de->count) == 0)
	free_proc_entry(de);
						/* boom! */
						if (de->deleted)
							free_proc_entry(de);
BUG: unable to handle kernel paging request at virtual address 6b6b6b6b
printing eip: c10acdda *pdpt = 00000000338f8001 *pde = 0000000000000000
Oops: 0000 [#1] PREEMPT SMP
Modules linked in: af_packet ipv6 cpufreq_ondemand loop serio_raw psmouse k8temp hwmon sr_mod cdrom
Pid: 23161, comm: cat Not tainted (2.6.24-rc2-8c0863403f109a43d7000b4646da4818220d501f #4)
EIP: 0060:[<c10acdda>] EFLAGS: 00210097 CPU: 1
EIP is at strnlen+0x6/0x18
EAX: 6b6b6b6b EBX: 6b6b6b6b ECX: 6b6b6b6b EDX: fffffffe
ESI: c128fa3b EDI: f380bf34 EBP: ffffffff ESP: f380be44
 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
Process cat (pid: 23161, ti=f380b000 task=f38f2570 task.ti=f380b000)
Stack: c10ac4f0 00000278 c12ce000 f43cd2a8 00000163 00000000 7da86067 00000400
       c128fa20 00896b18 f38325a8 c128fe20 ffffffff 00000000 c11f291e 00000400
       f75be300 c128fa20 f769c9a0 c10ac779 f380bf34 f7bfee70 c1018e6b f380bf34
Call Trace:
 [<c10ac4f0>] vsnprintf+0x2ad/0x49b
 [<c10ac779>] vscnprintf+0x14/0x1f
 [<c1018e6b>] vprintk+0xc5/0x2f9
 [<c10379f1>] handle_fasteoi_irq+0x0/0xab
 [<c1004f44>] do_IRQ+0x9f/0xb7
 [<c117db3b>] preempt_schedule_irq+0x3f/0x5b
 [<c100264e>] need_resched+0x1f/0x21
 [<c10190ba>] printk+0x1b/0x1f
 [<c107c8ad>] de_put+0x3d/0x50
 [<c107c8f8>] proc_delete_inode+0x38/0x41
 [<c107c8c0>] proc_delete_inode+0x0/0x41
 [<c1066298>] generic_delete_inode+0x5e/0xc6
 [<c1065aa9>] iput+0x60/0x62
 [<c1063c8e>] d_kill+0x2d/0x46
 [<c1063fa9>] dput+0xdc/0xe4
 [<c10571a1>] __fput+0xb0/0xcd
 [<c1054e49>] filp_close+0x48/0x4f
 [<c1055ee9>] sys_close+0x67/0xa5
 [<c10026b6>] sysenter_past_esp+0x5f/0x85
=======================
Code: c9 74 0c f2 ae 74 05 bf 01 00 00 00 4f 89 fa 5f 89 d0 c3 85 c9 57 89 c7 89 d0 74 05 f2 ae 75 01 4f 89 f8 5f c3 89 c1 89 c8 eb 06 <80> 38 00 74 07 40 4a 83 fa ff 75 f4 29 c8 c3 90 90 90 57 83 c9
EIP: [<c10acdda>] strnlen+0x6/0x18 SS:ESP 0068:f380be44
Also, remove broken usage of ->deleted from reiserfs: if sget() succeeds,
module is already pinned and remove_proc_entry() can't happen => nobody
can mark PDE deleted.
Dummy proc root in netns code is not marked with refcount 1. AFAICS, we
never get it, it's just for proper /proc/net removal. I double checked
CLONE_NETNS continues to work.
Patch survives many hours of modprobe/rmmod/cat loops without new bugs
which can be attributed to refcounting.
Signed-off-by: Alexey Dobriyan <adobriyan@sw.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
											
										 
											2007-12-04 23:45:28 -08:00
										 |  |  | 	atomic_set(&ent->count, 1); | 
					
						
							| 
									
										
											  
											
												Fix rmmod/read/write races in /proc entries
Fix following races:
===========================================
1. Write via ->write_proc sleeps in copy_from_user(). Module disappears
   meanwhile. Or, more generically, system call done on /proc file, method
   supplied by module is called, module disappears meanwhile.
   pde = create_proc_entry()
   if (!pde)
	return -ENOMEM;
   pde->write_proc = ...
				open
				write
				copy_from_user
   pde = create_proc_entry();
   if (!pde) {
	remove_proc_entry();
	return -ENOMEM;
	/* module unloaded */
   }
				*boom*
==========================================
2. bogo-revoke aka proc_kill_inodes()
  remove_proc_entry		vfs_read
  proc_kill_inodes		[check ->f_op validness]
				[check ->f_op->read validness]
				[verify_area, security permissions checks]
	->f_op = NULL;
				if (file->f_op->read)
					/* ->f_op dereference, boom */
NOTE, NOTE, NOTE: file_operations are proxied for regular files only. Let's
see how this scheme behaves, then extend if needed for directories.
Directories creators in /proc only set ->owner for them, so proxying for
directories may be unneeded.
NOTE, NOTE, NOTE: methods being proxied are ->llseek, ->read, ->write,
->poll, ->unlocked_ioctl, ->ioctl, ->compat_ioctl, ->open, ->release.
If your in-tree module uses something else, yell at me. Full audit pending.
[akpm@linux-foundation.org: build fix]
Signed-off-by: Alexey Dobriyan <adobriyan@sw.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
											
										 
											2007-07-15 23:39:00 -07:00
										 |  |  | 	ent->pde_users = 0; | 
					
						
							|  |  |  | 	spin_lock_init(&ent->pde_unload_lock); | 
					
						
							|  |  |  | 	ent->pde_unload_completion = NULL; | 
					
						
							| 
									
										
										
										
											2008-07-25 01:48:29 -07:00
										 |  |  | 	INIT_LIST_HEAD(&ent->pde_openers); | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  |  out: | 
					
						
							|  |  |  | 	return ent; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | struct proc_dir_entry *proc_symlink(const char *name, | 
					
						
							|  |  |  | 		struct proc_dir_entry *parent, const char *dest) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	struct proc_dir_entry *ent; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												proc: fix ->open'less usage due to ->proc_fops flip
Typical PDE creation code looks like:
	pde = create_proc_entry("foo", 0, NULL);
	if (pde)
		pde->proc_fops = &foo_proc_fops;
Notice that PDE is first created, only then ->proc_fops is set up to
final value. This is a problem because right after creation
a) PDE is fully visible in /proc , and
b) ->proc_fops are proc_file_operations which do not have ->open callback. So, it's
   possible to ->read without ->open (see one class of oopses below).
The fix is new API called proc_create() which makes sure ->proc_fops are
set up before gluing PDE to main tree. Typical new code looks like:
	pde = proc_create("foo", 0, NULL, &foo_proc_fops);
	if (!pde)
		return -ENOMEM;
Fix most networking users for a start.
In the long run, create_proc_entry() for regular files will go.
BUG: unable to handle kernel NULL pointer dereference at virtual address 00000024
printing eip: c1188c1b *pdpt = 000000002929e001 *pde = 0000000000000000
Oops: 0002 [#1] PREEMPT SMP DEBUG_PAGEALLOC
last sysfs file: /sys/block/sda/sda1/dev
Modules linked in: foo af_packet ipv6 cpufreq_ondemand loop serio_raw psmouse k8temp hwmon sr_mod cdrom
Pid: 24679, comm: cat Not tainted (2.6.24-rc3-mm1 #2)
EIP: 0060:[<c1188c1b>] EFLAGS: 00210002 CPU: 0
EIP is at mutex_lock_nested+0x75/0x25d
EAX: 000006fe EBX: fffffffb ECX: 00001000 EDX: e9340570
ESI: 00000020 EDI: 00200246 EBP: e9340570 ESP: e8ea1ef8
 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
Process cat (pid: 24679, ti=E8EA1000 task=E9340570 task.ti=E8EA1000)
Stack: 00000000 c106f7ce e8ee05b4 00000000 00000001 458003d0 f6fb6f20 fffffffb
       00000000 c106f7aa 00001000 c106f7ce 08ae9000 f6db53f0 00000020 00200246
       00000000 00000002 00000000 00200246 00200246 e8ee05a0 fffffffb e8ee0550
Call Trace:
 [<c106f7ce>] seq_read+0x24/0x28a
 [<c106f7aa>] seq_read+0x0/0x28a
 [<c106f7ce>] seq_read+0x24/0x28a
 [<c106f7aa>] seq_read+0x0/0x28a
 [<c10818b8>] proc_reg_read+0x60/0x73
 [<c1081858>] proc_reg_read+0x0/0x73
 [<c105a34f>] vfs_read+0x6c/0x8b
 [<c105a6f3>] sys_read+0x3c/0x63
 [<c10025f2>] sysenter_past_esp+0x5f/0xa5
 [<c10697a7>] destroy_inode+0x24/0x33
 =======================
INFO: lockdep is turned off.
Code: 75 21 68 e1 1a 19 c1 68 87 00 00 00 68 b8 e8 1f c1 68 25 73 1f c1 e8 84 06 e9 ff e8 52 b8 e7 ff 83 c4 10 9c 5f fa e8 28 89 ea ff <f0> fe 4e 04 79 0a f3 90 80 7e 04 00 7e f8 eb f0 39 76 34 74 33
EIP: [<c1188c1b>] mutex_lock_nested+0x75/0x25d SS:ESP 0068:e8ea1ef8
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Alexey Dobriyan <adobriyan@sw.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
											
										 
											2008-02-08 04:18:37 -08:00
										 |  |  | 	ent = __proc_create(&parent, name, | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 			  (S_IFLNK | S_IRUGO | S_IWUGO | S_IXUGO),1); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	if (ent) { | 
					
						
							|  |  |  | 		ent->data = kmalloc((ent->size=strlen(dest))+1, GFP_KERNEL); | 
					
						
							|  |  |  | 		if (ent->data) { | 
					
						
							|  |  |  | 			strcpy((char*)ent->data,dest); | 
					
						
							|  |  |  | 			if (proc_register(parent, ent) < 0) { | 
					
						
							|  |  |  | 				kfree(ent->data); | 
					
						
							|  |  |  | 				kfree(ent); | 
					
						
							|  |  |  | 				ent = NULL; | 
					
						
							|  |  |  | 			} | 
					
						
							|  |  |  | 		} else { | 
					
						
							|  |  |  | 			kfree(ent); | 
					
						
							|  |  |  | 			ent = NULL; | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	return ent; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode, | 
					
						
							|  |  |  | 		struct proc_dir_entry *parent) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	struct proc_dir_entry *ent; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												proc: fix ->open'less usage due to ->proc_fops flip
Typical PDE creation code looks like:
	pde = create_proc_entry("foo", 0, NULL);
	if (pde)
		pde->proc_fops = &foo_proc_fops;
Notice that PDE is first created, only then ->proc_fops is set up to
final value. This is a problem because right after creation
a) PDE is fully visible in /proc , and
b) ->proc_fops are proc_file_operations which do not have ->open callback. So, it's
   possible to ->read without ->open (see one class of oopses below).
The fix is new API called proc_create() which makes sure ->proc_fops are
set up before gluing PDE to main tree. Typical new code looks like:
	pde = proc_create("foo", 0, NULL, &foo_proc_fops);
	if (!pde)
		return -ENOMEM;
Fix most networking users for a start.
In the long run, create_proc_entry() for regular files will go.
BUG: unable to handle kernel NULL pointer dereference at virtual address 00000024
printing eip: c1188c1b *pdpt = 000000002929e001 *pde = 0000000000000000
Oops: 0002 [#1] PREEMPT SMP DEBUG_PAGEALLOC
last sysfs file: /sys/block/sda/sda1/dev
Modules linked in: foo af_packet ipv6 cpufreq_ondemand loop serio_raw psmouse k8temp hwmon sr_mod cdrom
Pid: 24679, comm: cat Not tainted (2.6.24-rc3-mm1 #2)
EIP: 0060:[<c1188c1b>] EFLAGS: 00210002 CPU: 0
EIP is at mutex_lock_nested+0x75/0x25d
EAX: 000006fe EBX: fffffffb ECX: 00001000 EDX: e9340570
ESI: 00000020 EDI: 00200246 EBP: e9340570 ESP: e8ea1ef8
 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
Process cat (pid: 24679, ti=E8EA1000 task=E9340570 task.ti=E8EA1000)
Stack: 00000000 c106f7ce e8ee05b4 00000000 00000001 458003d0 f6fb6f20 fffffffb
       00000000 c106f7aa 00001000 c106f7ce 08ae9000 f6db53f0 00000020 00200246
       00000000 00000002 00000000 00200246 00200246 e8ee05a0 fffffffb e8ee0550
Call Trace:
 [<c106f7ce>] seq_read+0x24/0x28a
 [<c106f7aa>] seq_read+0x0/0x28a
 [<c106f7ce>] seq_read+0x24/0x28a
 [<c106f7aa>] seq_read+0x0/0x28a
 [<c10818b8>] proc_reg_read+0x60/0x73
 [<c1081858>] proc_reg_read+0x0/0x73
 [<c105a34f>] vfs_read+0x6c/0x8b
 [<c105a6f3>] sys_read+0x3c/0x63
 [<c10025f2>] sysenter_past_esp+0x5f/0xa5
 [<c10697a7>] destroy_inode+0x24/0x33
 =======================
INFO: lockdep is turned off.
Code: 75 21 68 e1 1a 19 c1 68 87 00 00 00 68 b8 e8 1f c1 68 25 73 1f c1 e8 84 06 e9 ff e8 52 b8 e7 ff 83 c4 10 9c 5f fa e8 28 89 ea ff <f0> fe 4e 04 79 0a f3 90 80 7e 04 00 7e f8 eb f0 39 76 34 74 33
EIP: [<c1188c1b>] mutex_lock_nested+0x75/0x25d SS:ESP 0068:e8ea1ef8
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Alexey Dobriyan <adobriyan@sw.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
											
										 
											2008-02-08 04:18:37 -08:00
										 |  |  | 	ent = __proc_create(&parent, name, S_IFDIR | mode, 2); | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	if (ent) { | 
					
						
							|  |  |  | 		if (proc_register(parent, ent) < 0) { | 
					
						
							|  |  |  | 			kfree(ent); | 
					
						
							|  |  |  | 			ent = NULL; | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	return ent; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-05-02 04:12:41 -07:00
										 |  |  | struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name, | 
					
						
							|  |  |  | 		struct proc_dir_entry *parent) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	struct proc_dir_entry *ent; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	ent = __proc_create(&parent, name, S_IFDIR | S_IRUGO | S_IXUGO, 2); | 
					
						
							|  |  |  | 	if (ent) { | 
					
						
							|  |  |  | 		ent->data = net; | 
					
						
							|  |  |  | 		if (proc_register(parent, ent) < 0) { | 
					
						
							|  |  |  | 			kfree(ent); | 
					
						
							|  |  |  | 			ent = NULL; | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	return ent; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | EXPORT_SYMBOL_GPL(proc_net_mkdir); | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | struct proc_dir_entry *proc_mkdir(const char *name, | 
					
						
							|  |  |  | 		struct proc_dir_entry *parent) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	return proc_mkdir_mode(name, S_IRUGO | S_IXUGO, parent); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, | 
					
						
							|  |  |  | 					 struct proc_dir_entry *parent) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	struct proc_dir_entry *ent; | 
					
						
							|  |  |  | 	nlink_t nlink; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	if (S_ISDIR(mode)) { | 
					
						
							|  |  |  | 		if ((mode & S_IALLUGO) == 0) | 
					
						
							|  |  |  | 			mode |= S_IRUGO | S_IXUGO; | 
					
						
							|  |  |  | 		nlink = 2; | 
					
						
							|  |  |  | 	} else { | 
					
						
							|  |  |  | 		if ((mode & S_IFMT) == 0) | 
					
						
							|  |  |  | 			mode |= S_IFREG; | 
					
						
							|  |  |  | 		if ((mode & S_IALLUGO) == 0) | 
					
						
							|  |  |  | 			mode |= S_IRUGO; | 
					
						
							|  |  |  | 		nlink = 1; | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												proc: fix ->open'less usage due to ->proc_fops flip
Typical PDE creation code looks like:
	pde = create_proc_entry("foo", 0, NULL);
	if (pde)
		pde->proc_fops = &foo_proc_fops;
Notice that PDE is first created, only then ->proc_fops is set up to
final value. This is a problem because right after creation
a) PDE is fully visible in /proc , and
b) ->proc_fops are proc_file_operations which do not have ->open callback. So, it's
   possible to ->read without ->open (see one class of oopses below).
The fix is new API called proc_create() which makes sure ->proc_fops are
set up before gluing PDE to main tree. Typical new code looks like:
	pde = proc_create("foo", 0, NULL, &foo_proc_fops);
	if (!pde)
		return -ENOMEM;
Fix most networking users for a start.
In the long run, create_proc_entry() for regular files will go.
BUG: unable to handle kernel NULL pointer dereference at virtual address 00000024
printing eip: c1188c1b *pdpt = 000000002929e001 *pde = 0000000000000000
Oops: 0002 [#1] PREEMPT SMP DEBUG_PAGEALLOC
last sysfs file: /sys/block/sda/sda1/dev
Modules linked in: foo af_packet ipv6 cpufreq_ondemand loop serio_raw psmouse k8temp hwmon sr_mod cdrom
Pid: 24679, comm: cat Not tainted (2.6.24-rc3-mm1 #2)
EIP: 0060:[<c1188c1b>] EFLAGS: 00210002 CPU: 0
EIP is at mutex_lock_nested+0x75/0x25d
EAX: 000006fe EBX: fffffffb ECX: 00001000 EDX: e9340570
ESI: 00000020 EDI: 00200246 EBP: e9340570 ESP: e8ea1ef8
 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
Process cat (pid: 24679, ti=E8EA1000 task=E9340570 task.ti=E8EA1000)
Stack: 00000000 c106f7ce e8ee05b4 00000000 00000001 458003d0 f6fb6f20 fffffffb
       00000000 c106f7aa 00001000 c106f7ce 08ae9000 f6db53f0 00000020 00200246
       00000000 00000002 00000000 00200246 00200246 e8ee05a0 fffffffb e8ee0550
Call Trace:
 [<c106f7ce>] seq_read+0x24/0x28a
 [<c106f7aa>] seq_read+0x0/0x28a
 [<c106f7ce>] seq_read+0x24/0x28a
 [<c106f7aa>] seq_read+0x0/0x28a
 [<c10818b8>] proc_reg_read+0x60/0x73
 [<c1081858>] proc_reg_read+0x0/0x73
 [<c105a34f>] vfs_read+0x6c/0x8b
 [<c105a6f3>] sys_read+0x3c/0x63
 [<c10025f2>] sysenter_past_esp+0x5f/0xa5
 [<c10697a7>] destroy_inode+0x24/0x33
 =======================
INFO: lockdep is turned off.
Code: 75 21 68 e1 1a 19 c1 68 87 00 00 00 68 b8 e8 1f c1 68 25 73 1f c1 e8 84 06 e9 ff e8 52 b8 e7 ff 83 c4 10 9c 5f fa e8 28 89 ea ff <f0> fe 4e 04 79 0a f3 90 80 7e 04 00 7e f8 eb f0 39 76 34 74 33
EIP: [<c1188c1b>] mutex_lock_nested+0x75/0x25d SS:ESP 0068:e8ea1ef8
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Alexey Dobriyan <adobriyan@sw.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
											
										 
											2008-02-08 04:18:37 -08:00
										 |  |  | 	ent = __proc_create(&parent, name, mode, nlink); | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	if (ent) { | 
					
						
							|  |  |  | 		if (proc_register(parent, ent) < 0) { | 
					
						
							|  |  |  | 			kfree(ent); | 
					
						
							|  |  |  | 			ent = NULL; | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	return ent; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												proc: introduce proc_create_data to setup de->data
This set of patches fixes a proc ->open'less usage due to ->proc_fops flip in
the most part of the kernel code.  The original OOPS is described in the
commit 2d3a4e3666325a9709cc8ea2e88151394e8f20fc:
    Typical PDE creation code looks like:
    	pde = create_proc_entry("foo", 0, NULL);
    	if (pde)
    		pde->proc_fops = &foo_proc_fops;
    Notice that PDE is first created, only then ->proc_fops is set up to
    final value. This is a problem because right after creation
    a) PDE is fully visible in /proc , and
    b) ->proc_fops are proc_file_operations which do not have ->open callback. So, it's
       possible to ->read without ->open (see one class of oopses below).
    The fix is new API called proc_create() which makes sure ->proc_fops are
    set up before gluing PDE to main tree. Typical new code looks like:
    	pde = proc_create("foo", 0, NULL, &foo_proc_fops);
    	if (!pde)
    		return -ENOMEM;
    Fix most networking users for a start.
    In the long run, create_proc_entry() for regular files will go.
In addition to this, proc_create_data is introduced to fix reading from
proc without PDE->data. The race is basically the same as above.
create_proc_entries is replaced in the entire kernel code as new method
is also simply better.
This patch:
The problem is the same as for de->proc_fops.  Right now PDE becomes visible
without data set.  So, the entry could be looked up without data.  This, in
most cases, will simply OOPS.
proc_create_data call is created to address this issue.  proc_create now
becomes a wrapper around it.
Signed-off-by: Denis V. Lunev <den@openvz.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Cc: Alessandro Zummo <a.zummo@towertech.it>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Bjorn Helgaas <bjorn.helgaas@hp.com>
Cc: Chris Mason <chris.mason@oracle.com>
Acked-by: David Howells <dhowells@redhat.com>
Cc: Dmitry Torokhov <dtor@mail.ru>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Grant Grundler <grundler@parisc-linux.org>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Cc: Haavard Skinnemoen <hskinnemoen@atmel.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Jaroslav Kysela <perex@suse.cz>
Cc: Jeff Garzik <jgarzik@pobox.com>
Cc: Jeff Mahoney <jeffm@suse.com>
Cc: Jesper Nilsson <jesper.nilsson@axis.com>
Cc: Karsten Keil <kkeil@suse.de>
Cc: Kyle McMartin <kyle@parisc-linux.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Cc: Matthew Wilcox <matthew@wil.cx>
Cc: Mauro Carvalho Chehab <mchehab@infradead.org>
Cc: Mikael Starvik <starvik@axis.com>
Cc: Nadia Derbey <Nadia.Derbey@bull.net>
Cc: Neil Brown <neilb@suse.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Osterlund <petero2@telia.com>
Cc: Pierre Peiffer <peifferp@gmail.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Takashi Iwai <tiwai@suse.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
											
										 
											2008-04-29 01:02:00 -07:00
										 |  |  | struct proc_dir_entry *proc_create_data(const char *name, mode_t mode, | 
					
						
							|  |  |  | 					struct proc_dir_entry *parent, | 
					
						
							|  |  |  | 					const struct file_operations *proc_fops, | 
					
						
							|  |  |  | 					void *data) | 
					
						
							| 
									
										
											  
											
												proc: fix ->open'less usage due to ->proc_fops flip
Typical PDE creation code looks like:
	pde = create_proc_entry("foo", 0, NULL);
	if (pde)
		pde->proc_fops = &foo_proc_fops;
Notice that PDE is first created, only then ->proc_fops is set up to
final value. This is a problem because right after creation
a) PDE is fully visible in /proc , and
b) ->proc_fops are proc_file_operations which do not have ->open callback. So, it's
   possible to ->read without ->open (see one class of oopses below).
The fix is new API called proc_create() which makes sure ->proc_fops are
set up before gluing PDE to main tree. Typical new code looks like:
	pde = proc_create("foo", 0, NULL, &foo_proc_fops);
	if (!pde)
		return -ENOMEM;
Fix most networking users for a start.
In the long run, create_proc_entry() for regular files will go.
BUG: unable to handle kernel NULL pointer dereference at virtual address 00000024
printing eip: c1188c1b *pdpt = 000000002929e001 *pde = 0000000000000000
Oops: 0002 [#1] PREEMPT SMP DEBUG_PAGEALLOC
last sysfs file: /sys/block/sda/sda1/dev
Modules linked in: foo af_packet ipv6 cpufreq_ondemand loop serio_raw psmouse k8temp hwmon sr_mod cdrom
Pid: 24679, comm: cat Not tainted (2.6.24-rc3-mm1 #2)
EIP: 0060:[<c1188c1b>] EFLAGS: 00210002 CPU: 0
EIP is at mutex_lock_nested+0x75/0x25d
EAX: 000006fe EBX: fffffffb ECX: 00001000 EDX: e9340570
ESI: 00000020 EDI: 00200246 EBP: e9340570 ESP: e8ea1ef8
 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
Process cat (pid: 24679, ti=E8EA1000 task=E9340570 task.ti=E8EA1000)
Stack: 00000000 c106f7ce e8ee05b4 00000000 00000001 458003d0 f6fb6f20 fffffffb
       00000000 c106f7aa 00001000 c106f7ce 08ae9000 f6db53f0 00000020 00200246
       00000000 00000002 00000000 00200246 00200246 e8ee05a0 fffffffb e8ee0550
Call Trace:
 [<c106f7ce>] seq_read+0x24/0x28a
 [<c106f7aa>] seq_read+0x0/0x28a
 [<c106f7ce>] seq_read+0x24/0x28a
 [<c106f7aa>] seq_read+0x0/0x28a
 [<c10818b8>] proc_reg_read+0x60/0x73
 [<c1081858>] proc_reg_read+0x0/0x73
 [<c105a34f>] vfs_read+0x6c/0x8b
 [<c105a6f3>] sys_read+0x3c/0x63
 [<c10025f2>] sysenter_past_esp+0x5f/0xa5
 [<c10697a7>] destroy_inode+0x24/0x33
 =======================
INFO: lockdep is turned off.
Code: 75 21 68 e1 1a 19 c1 68 87 00 00 00 68 b8 e8 1f c1 68 25 73 1f c1 e8 84 06 e9 ff e8 52 b8 e7 ff 83 c4 10 9c 5f fa e8 28 89 ea ff <f0> fe 4e 04 79 0a f3 90 80 7e 04 00 7e f8 eb f0 39 76 34 74 33
EIP: [<c1188c1b>] mutex_lock_nested+0x75/0x25d SS:ESP 0068:e8ea1ef8
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Alexey Dobriyan <adobriyan@sw.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
											
										 
											2008-02-08 04:18:37 -08:00
										 |  |  | { | 
					
						
							|  |  |  | 	struct proc_dir_entry *pde; | 
					
						
							|  |  |  | 	nlink_t nlink; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	if (S_ISDIR(mode)) { | 
					
						
							|  |  |  | 		if ((mode & S_IALLUGO) == 0) | 
					
						
							|  |  |  | 			mode |= S_IRUGO | S_IXUGO; | 
					
						
							|  |  |  | 		nlink = 2; | 
					
						
							|  |  |  | 	} else { | 
					
						
							|  |  |  | 		if ((mode & S_IFMT) == 0) | 
					
						
							|  |  |  | 			mode |= S_IFREG; | 
					
						
							|  |  |  | 		if ((mode & S_IALLUGO) == 0) | 
					
						
							|  |  |  | 			mode |= S_IRUGO; | 
					
						
							|  |  |  | 		nlink = 1; | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	pde = __proc_create(&parent, name, mode, nlink); | 
					
						
							|  |  |  | 	if (!pde) | 
					
						
							|  |  |  | 		goto out; | 
					
						
							|  |  |  | 	pde->proc_fops = proc_fops; | 
					
						
							| 
									
										
											  
											
												proc: introduce proc_create_data to setup de->data
This set of patches fixes an proc ->open'less usage due to ->proc_fops flip in
the most part of the kernel code.  The original OOPS is described in the
commit 2d3a4e3666325a9709cc8ea2e88151394e8f20fc:
    Typical PDE creation code looks like:
    	pde = create_proc_entry("foo", 0, NULL);
    	if (pde)
    		pde->proc_fops = &foo_proc_fops;
    Notice that PDE is first created, only then ->proc_fops is set up to
    final value. This is a problem because right after creation
    a) PDE is fully visible in /proc , and
    b) ->proc_fops are proc_file_operations which do not have ->open callback. So, it's
       possible to ->read without ->open (see one class of oopses below).
    The fix is new API called proc_create() which makes sure ->proc_fops are
    set up before gluing PDE to main tree. Typical new code looks like:
    	pde = proc_create("foo", 0, NULL, &foo_proc_fops);
    	if (!pde)
    		return -ENOMEM;
    Fix most networking users for a start.
    In the long run, create_proc_entry() for regular files will go.
In addition to this, proc_create_data is introduced to fix reading from
proc without PDE->data. The race is basically the same as above.
create_proc_entries is replaced in the entire kernel code as new method
is also simply better.
This patch:
The problem is the same as for de->proc_fops.  Right now PDE becomes visible
without data set.  So, the entry could be looked up without data.  This, in
most cases, will simply OOPS.
proc_create_data call is created to address this issue.  proc_create now
becomes a wrapper around it.
Signed-off-by: Denis V. Lunev <den@openvz.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Cc: Alessandro Zummo <a.zummo@towertech.it>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Bjorn Helgaas <bjorn.helgaas@hp.com>
Cc: Chris Mason <chris.mason@oracle.com>
Acked-by: David Howells <dhowells@redhat.com>
Cc: Dmitry Torokhov <dtor@mail.ru>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Grant Grundler <grundler@parisc-linux.org>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Cc: Haavard Skinnemoen <hskinnemoen@atmel.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Jaroslav Kysela <perex@suse.cz>
Cc: Jeff Garzik <jgarzik@pobox.com>
Cc: Jeff Mahoney <jeffm@suse.com>
Cc: Jesper Nilsson <jesper.nilsson@axis.com>
Cc: Karsten Keil <kkeil@suse.de>
Cc: Kyle McMartin <kyle@parisc-linux.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Cc: Matthew Wilcox <matthew@wil.cx>
Cc: Mauro Carvalho Chehab <mchehab@infradead.org>
Cc: Mikael Starvik <starvik@axis.com>
Cc: Nadia Derbey <Nadia.Derbey@bull.net>
Cc: Neil Brown <neilb@suse.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Osterlund <petero2@telia.com>
Cc: Pierre Peiffer <peifferp@gmail.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Takashi Iwai <tiwai@suse.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
											
										 
											2008-04-29 01:02:00 -07:00
										 |  |  | 	pde->data = data; | 
					
						
							| 
									
										
											  
											
												proc: fix ->open'less usage due to ->proc_fops flip
Typical PDE creation code looks like:
	pde = create_proc_entry("foo", 0, NULL);
	if (pde)
		pde->proc_fops = &foo_proc_fops;
Notice that PDE is first created, only then ->proc_fops is set up to
final value. This is a problem because right after creation
a) PDE is fully visible in /proc , and
b) ->proc_fops are proc_file_operations which do not have ->open callback. So, it's
   possible to ->read without ->open (see one class of oopses below).
The fix is new API called proc_create() which makes sure ->proc_fops are
set up before gluing PDE to main tree. Typical new code looks like:
	pde = proc_create("foo", 0, NULL, &foo_proc_fops);
	if (!pde)
		return -ENOMEM;
Fix most networking users for a start.
In the long run, create_proc_entry() for regular files will go.
BUG: unable to handle kernel NULL pointer dereference at virtual address 00000024
printing eip: c1188c1b *pdpt = 000000002929e001 *pde = 0000000000000000
Oops: 0002 [#1] PREEMPT SMP DEBUG_PAGEALLOC
last sysfs file: /sys/block/sda/sda1/dev
Modules linked in: foo af_packet ipv6 cpufreq_ondemand loop serio_raw psmouse k8temp hwmon sr_mod cdrom
Pid: 24679, comm: cat Not tainted (2.6.24-rc3-mm1 #2)
EIP: 0060:[<c1188c1b>] EFLAGS: 00210002 CPU: 0
EIP is at mutex_lock_nested+0x75/0x25d
EAX: 000006fe EBX: fffffffb ECX: 00001000 EDX: e9340570
ESI: 00000020 EDI: 00200246 EBP: e9340570 ESP: e8ea1ef8
 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
Process cat (pid: 24679, ti=E8EA1000 task=E9340570 task.ti=E8EA1000)
Stack: 00000000 c106f7ce e8ee05b4 00000000 00000001 458003d0 f6fb6f20 fffffffb
       00000000 c106f7aa 00001000 c106f7ce 08ae9000 f6db53f0 00000020 00200246
       00000000 00000002 00000000 00200246 00200246 e8ee05a0 fffffffb e8ee0550
Call Trace:
 [<c106f7ce>] seq_read+0x24/0x28a
 [<c106f7aa>] seq_read+0x0/0x28a
 [<c106f7ce>] seq_read+0x24/0x28a
 [<c106f7aa>] seq_read+0x0/0x28a
 [<c10818b8>] proc_reg_read+0x60/0x73
 [<c1081858>] proc_reg_read+0x0/0x73
 [<c105a34f>] vfs_read+0x6c/0x8b
 [<c105a6f3>] sys_read+0x3c/0x63
 [<c10025f2>] sysenter_past_esp+0x5f/0xa5
 [<c10697a7>] destroy_inode+0x24/0x33
 =======================
INFO: lockdep is turned off.
Code: 75 21 68 e1 1a 19 c1 68 87 00 00 00 68 b8 e8 1f c1 68 25 73 1f c1 e8 84 06 e9 ff e8 52 b8 e7 ff 83 c4 10 9c 5f fa e8 28 89 ea ff <f0> fe 4e 04 79 0a f3 90 80 7e 04 00 7e f8 eb f0 39 76 34 74 33
EIP: [<c1188c1b>] mutex_lock_nested+0x75/0x25d SS:ESP 0068:e8ea1ef8
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Alexey Dobriyan <adobriyan@sw.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
											
										 
											2008-02-08 04:18:37 -08:00
										 |  |  | 	if (proc_register(parent, pde) < 0) | 
					
						
							|  |  |  | 		goto out_free; | 
					
						
							|  |  |  | 	return pde; | 
					
						
							|  |  |  | out_free: | 
					
						
							|  |  |  | 	kfree(pde); | 
					
						
							|  |  |  | out: | 
					
						
							|  |  |  | 	return NULL; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | void free_proc_entry(struct proc_dir_entry *de) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	unsigned int ino = de->low_ino; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	if (ino < PROC_DYNAMIC_FIRST) | 
					
						
							|  |  |  | 		return; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	release_inode_number(ino); | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-02-08 04:18:28 -08:00
										 |  |  | 	if (S_ISLNK(de->mode)) | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 		kfree(de->data); | 
					
						
							|  |  |  | 	kfree(de); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /*
 | 
					
						
							|  |  |  |  * Remove a /proc entry and free it if it's not currently in use. | 
					
						
							|  |  |  |  */ | 
					
						
							|  |  |  | void remove_proc_entry(const char *name, struct proc_dir_entry *parent) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	struct proc_dir_entry **p; | 
					
						
							| 
									
										
										
										
											2008-04-29 01:01:39 -07:00
										 |  |  | 	struct proc_dir_entry *de = NULL; | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	const char *fn = name; | 
					
						
							|  |  |  | 	int len; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
											
												proc: less special case in xlate code
If valid "parent" is passed to proc_create/remove_proc_entry(), then name of
PDE should consist of only one path component, otherwise creation or or
removal will fail.  However, if NULL is passed as parent then create/remove
accept full path as a argument.  This is arbitrary restriction -- all
infrastructure is in place.
So, patch allows the following to succeed:
	create_proc_entry("foo/bar", 0, pde_baz);
	remove_proc_entry("baz/foo/bar", &proc_root);
Also makes the following to behave identically:
	create_proc_entry("foo/bar", 0, NULL);
	create_proc_entry("foo/bar", 0, &proc_root);
Discrepancy noticed by Den Lunev (IIRC).
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
											
										 
											2008-04-29 01:01:40 -07:00
										 |  |  | 	if (xlate_proc_name(name, &parent, &fn) != 0) | 
					
						
							| 
									
										
										
										
											2008-04-29 01:01:39 -07:00
										 |  |  | 		return; | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	len = strlen(fn); | 
					
						
							| 
									
										
										
										
											2006-03-26 01:36:55 -08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 	spin_lock(&proc_subdir_lock); | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	for (p = &parent->subdir; *p; p=&(*p)->next ) { | 
					
						
							| 
									
										
										
										
											2008-04-29 01:01:39 -07:00
										 |  |  | 		if (proc_match(len, fn, *p)) { | 
					
						
							|  |  |  | 			de = *p; | 
					
						
							|  |  |  | 			*p = de->next; | 
					
						
							|  |  |  | 			de->next = NULL; | 
					
						
							|  |  |  | 			break; | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	spin_unlock(&proc_subdir_lock); | 
					
						
							|  |  |  | 	if (!de) | 
					
						
							|  |  |  | 		return; | 
					
						
							| 
									
										
											  
											
												Fix rmmod/read/write races in /proc entries
Fix following races:
===========================================
1. Write via ->write_proc sleeps in copy_from_user(). Module disappears
   meanwhile. Or, more generically, system call done on /proc file, method
   supplied by module is called, module dissapeares meanwhile.
   pde = create_proc_entry()
   if (!pde)
	return -ENOMEM;
   pde->write_proc = ...
				open
				write
				copy_from_user
   pde = create_proc_entry();
   if (!pde) {
	remove_proc_entry();
	return -ENOMEM;
	/* module unloaded */
   }
				*boom*
==========================================
2. bogo-revoke aka proc_kill_inodes()
  remove_proc_entry		vfs_read
  proc_kill_inodes		[check ->f_op validness]
				[check ->f_op->read validness]
				[verify_area, security permissions checks]
	->f_op = NULL;
				if (file->f_op->read)
					/* ->f_op dereference, boom */
NOTE, NOTE, NOTE: file_operations are proxied for regular files only. Let's
see how this scheme behaves, then extend if needed for directories.
Directories creators in /proc only set ->owner for them, so proxying for
directories may be unneeded.
NOTE, NOTE, NOTE: methods being proxied are ->llseek, ->read, ->write,
->poll, ->unlocked_ioctl, ->ioctl, ->compat_ioctl, ->open, ->release.
If your in-tree module uses something else, yell on me. Full audit pending.
[akpm@linux-foundation.org: build fix]
Signed-off-by: Alexey Dobriyan <adobriyan@sw.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
											
										 
											2007-07-15 23:39:00 -07:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-04-29 01:01:39 -07:00
										 |  |  | 	spin_lock(&de->pde_unload_lock); | 
					
						
							|  |  |  | 	/*
 | 
					
						
							|  |  |  | 	 * Stop accepting new callers into module. If you're | 
					
						
							|  |  |  | 	 * dynamically allocating ->proc_fops, save a pointer somewhere. | 
					
						
							|  |  |  | 	 */ | 
					
						
							|  |  |  | 	de->proc_fops = NULL; | 
					
						
							|  |  |  | 	/* Wait until all existing callers into module are done. */ | 
					
						
							|  |  |  | 	if (de->pde_users > 0) { | 
					
						
							|  |  |  | 		DECLARE_COMPLETION_ONSTACK(c); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 		if (!de->pde_unload_completion) | 
					
						
							|  |  |  | 			de->pde_unload_completion = &c; | 
					
						
							| 
									
										
											  
											
												Fix rmmod/read/write races in /proc entries
Fix following races:
===========================================
1. Write via ->write_proc sleeps in copy_from_user(). Module disappears
   meanwhile. Or, more generically, system call done on /proc file, method
   supplied by module is called, module dissapeares meanwhile.
   pde = create_proc_entry()
   if (!pde)
	return -ENOMEM;
   pde->write_proc = ...
				open
				write
				copy_from_user
   pde = create_proc_entry();
   if (!pde) {
	remove_proc_entry();
	return -ENOMEM;
	/* module unloaded */
   }
				*boom*
==========================================
2. bogo-revoke aka proc_kill_inodes()
  remove_proc_entry		vfs_read
  proc_kill_inodes		[check ->f_op validness]
				[check ->f_op->read validness]
				[verify_area, security permissions checks]
	->f_op = NULL;
				if (file->f_op->read)
					/* ->f_op dereference, boom */
NOTE, NOTE, NOTE: file_operations are proxied for regular files only. Let's
see how this scheme behaves, then extend if needed for directories.
Directories creators in /proc only set ->owner for them, so proxying for
directories may be unneeded.
NOTE, NOTE, NOTE: methods being proxied are ->llseek, ->read, ->write,
->poll, ->unlocked_ioctl, ->ioctl, ->compat_ioctl, ->open, ->release.
If your in-tree module uses something else, yell on me. Full audit pending.
[akpm@linux-foundation.org: build fix]
Signed-off-by: Alexey Dobriyan <adobriyan@sw.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
											
										 
											2007-07-15 23:39:00 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | 		spin_unlock(&de->pde_unload_lock); | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-04-29 01:01:39 -07:00
										 |  |  | 		wait_for_completion(de->pde_unload_completion); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 		goto continue_removing; | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	spin_unlock(&de->pde_unload_lock); | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												Fix rmmod/read/write races in /proc entries
Fix following races:
===========================================
1. Write via ->write_proc sleeps in copy_from_user(). Module disappears
   meanwhile. Or, more generically, system call done on /proc file, method
   supplied by module is called, module dissapeares meanwhile.
   pde = create_proc_entry()
   if (!pde)
	return -ENOMEM;
   pde->write_proc = ...
				open
				write
				copy_from_user
   pde = create_proc_entry();
   if (!pde) {
	remove_proc_entry();
	return -ENOMEM;
	/* module unloaded */
   }
				*boom*
==========================================
2. bogo-revoke aka proc_kill_inodes()
  remove_proc_entry		vfs_read
  proc_kill_inodes		[check ->f_op validness]
				[check ->f_op->read validness]
				[verify_area, security permissions checks]
	->f_op = NULL;
				if (file->f_op->read)
					/* ->f_op dereference, boom */
NOTE, NOTE, NOTE: file_operations are proxied for regular files only. Let's
see how this scheme behaves, then extend if needed for directories.
Directories creators in /proc only set ->owner for them, so proxying for
directories may be unneeded.
NOTE, NOTE, NOTE: methods being proxied are ->llseek, ->read, ->write,
->poll, ->unlocked_ioctl, ->ioctl, ->compat_ioctl, ->open, ->release.
If your in-tree module uses something else, yell on me. Full audit pending.
[akpm@linux-foundation.org: build fix]
Signed-off-by: Alexey Dobriyan <adobriyan@sw.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
											
										 
											2007-07-15 23:39:00 -07:00
										 |  |  | continue_removing: | 
					
						
							| 
									
										
										
										
											2008-07-25 01:48:29 -07:00
										 |  |  | 	spin_lock(&de->pde_unload_lock); | 
					
						
							|  |  |  | 	while (!list_empty(&de->pde_openers)) { | 
					
						
							|  |  |  | 		struct pde_opener *pdeo; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 		pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh); | 
					
						
							|  |  |  | 		list_del(&pdeo->lh); | 
					
						
							|  |  |  | 		spin_unlock(&de->pde_unload_lock); | 
					
						
							|  |  |  | 		pdeo->release(pdeo->inode, pdeo->file); | 
					
						
							|  |  |  | 		kfree(pdeo); | 
					
						
							|  |  |  | 		spin_lock(&de->pde_unload_lock); | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	spin_unlock(&de->pde_unload_lock); | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-04-29 01:01:39 -07:00
										 |  |  | 	if (S_ISDIR(de->mode)) | 
					
						
							|  |  |  | 		parent->nlink--; | 
					
						
							|  |  |  | 	de->nlink = 0; | 
					
						
							| 
									
										
										
										
											2008-07-25 19:45:41 -07:00
										 |  |  | 	WARN(de->subdir, KERN_WARNING "%s: removing non-empty directory " | 
					
						
							| 
									
										
										
										
											2008-04-29 01:01:39 -07:00
										 |  |  | 			"'%s/%s', leaking at least '%s'\n", __func__, | 
					
						
							|  |  |  | 			de->parent->name, de->name, de->subdir->name); | 
					
						
							|  |  |  | 	if (atomic_dec_and_test(&de->count)) | 
					
						
							|  |  |  | 		free_proc_entry(de); | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | } |