| 
									
										
										
										
											2014-09-26 00:16:57 -07:00
										 |  |  | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  * This program is free software; you can redistribute it and/or | 
					
						
							|  |  |  |  * modify it under the terms of version 2 of the GNU General Public | 
					
						
							|  |  |  |  * License as published by the Free Software Foundation. | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  * This program is distributed in the hope that it will be useful, but | 
					
						
							|  |  |  |  * WITHOUT ANY WARRANTY; without even the implied warranty of | 
					
						
							|  |  |  |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | 
					
						
							|  |  |  |  * General Public License for more details. | 
					
						
							|  |  |  |  */ | 
					
						
							|  |  |  | #include <linux/bpf.h>
 | 
					
						
							|  |  |  | #include <linux/syscalls.h>
 | 
					
						
							|  |  |  | #include <linux/slab.h>
 | 
					
						
							|  |  |  | #include <linux/anon_inodes.h>
 | 
					
						
							| 
									
										
											  
											
												bpf: add lookup/update/delete/iterate methods to BPF maps
'maps' is a generic storage of different types for sharing data between kernel
and userspace.
The maps are accessed from user space via BPF syscall, which has commands:
- create a map with given type and attributes
  fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size)
  returns fd or negative error
- lookup key in a given map referenced by fd
  err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)
  using attr->map_fd, attr->key, attr->value
  returns zero and stores found elem into value or negative error
- create or update key/value pair in a given map
  err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
  using attr->map_fd, attr->key, attr->value
  returns zero or negative error
- find and delete element by key in a given map
  err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)
  using attr->map_fd, attr->key
- iterate map elements (based on input key return next_key)
  err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size)
  using attr->map_fd, attr->key, attr->next_key
- close(fd) deletes the map
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
											
										 
											2014-09-26 00:16:59 -07:00
										 |  |  | #include <linux/file.h>
 | 
					
						
							| 
									
										
										
										
											2014-09-26 00:17:00 -07:00
										 |  |  | #include <linux/license.h>
 | 
					
						
							|  |  |  | #include <linux/filter.h>
 | 
					
						
							| 
									
										
										
										
											2014-09-26 00:16:57 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | static LIST_HEAD(bpf_map_types); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	struct bpf_map_type_list *tl; | 
					
						
							|  |  |  | 	struct bpf_map *map; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	list_for_each_entry(tl, &bpf_map_types, list_node) { | 
					
						
							|  |  |  | 		if (tl->type == attr->map_type) { | 
					
						
							|  |  |  | 			map = tl->ops->map_alloc(attr); | 
					
						
							|  |  |  | 			if (IS_ERR(map)) | 
					
						
							|  |  |  | 				return map; | 
					
						
							|  |  |  | 			map->ops = tl->ops; | 
					
						
							|  |  |  | 			map->map_type = attr->map_type; | 
					
						
							|  |  |  | 			return map; | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	return ERR_PTR(-EINVAL); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /* boot time registration of different map implementations */ | 
					
						
							|  |  |  | void bpf_register_map_type(struct bpf_map_type_list *tl) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	list_add(&tl->list_node, &bpf_map_types); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /* called from workqueue */ | 
					
						
							|  |  |  | static void bpf_map_free_deferred(struct work_struct *work) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	struct bpf_map *map = container_of(work, struct bpf_map, work); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	/* implementation dependent freeing */ | 
					
						
							|  |  |  | 	map->ops->map_free(map); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /* decrement map refcnt and schedule it for freeing via workqueue
 | 
					
						
							|  |  |  |  * (unrelying map implementation ops->map_free() might sleep) | 
					
						
							|  |  |  |  */ | 
					
						
							|  |  |  | void bpf_map_put(struct bpf_map *map) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	if (atomic_dec_and_test(&map->refcnt)) { | 
					
						
							|  |  |  | 		INIT_WORK(&map->work, bpf_map_free_deferred); | 
					
						
							|  |  |  | 		schedule_work(&map->work); | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static int bpf_map_release(struct inode *inode, struct file *filp) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	struct bpf_map *map = filp->private_data; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	bpf_map_put(map); | 
					
						
							|  |  |  | 	return 0; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static const struct file_operations bpf_map_fops = { | 
					
						
							|  |  |  | 	.release = bpf_map_release, | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							/* helper macro to check that unused fields of 'union bpf_attr' are zero
 *
 * Expects a 'union bpf_attr *attr' in the caller's scope and a matching
 * CMD##_LAST_FIELD define naming the last field the command uses.
 * Evaluates to true when any byte between the end of that field and the
 * end of the union is non-zero.
 *
 * The whole expansion is parenthesized so that the macro can be combined
 * with other operators (e.g. !CHECK_ATTR(x)) without precedence surprises.
 */
#define CHECK_ATTR(CMD) \
	(memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
		    sizeof(attr->CMD##_LAST_FIELD), 0, \
		    sizeof(*attr) - \
		    offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
		    sizeof(attr->CMD##_LAST_FIELD)) != NULL)
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #define BPF_MAP_CREATE_LAST_FIELD max_entries
 | 
					
						
							|  |  |  | /* called via syscall */ | 
					
						
							|  |  |  | static int map_create(union bpf_attr *attr) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	struct bpf_map *map; | 
					
						
							|  |  |  | 	int err; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	err = CHECK_ATTR(BPF_MAP_CREATE); | 
					
						
							|  |  |  | 	if (err) | 
					
						
							|  |  |  | 		return -EINVAL; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	/* find map type and init map: hashtable vs rbtree vs bloom vs ... */ | 
					
						
							|  |  |  | 	map = find_and_alloc_map(attr); | 
					
						
							|  |  |  | 	if (IS_ERR(map)) | 
					
						
							|  |  |  | 		return PTR_ERR(map); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	atomic_set(&map->refcnt, 1); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	if (err < 0) | 
					
						
							|  |  |  | 		/* failed to allocate fd */ | 
					
						
							|  |  |  | 		goto free_map; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	return err; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | free_map: | 
					
						
							|  |  |  | 	map->ops->map_free(map); | 
					
						
							|  |  |  | 	return err; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												bpf: add lookup/update/delete/iterate methods to BPF maps
'maps' is a generic storage of different types for sharing data between kernel
and userspace.
The maps are accessed from user space via BPF syscall, which has commands:
- create a map with given type and attributes
  fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size)
  returns fd or negative error
- lookup key in a given map referenced by fd
  err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)
  using attr->map_fd, attr->key, attr->value
  returns zero and stores found elem into value or negative error
- create or update key/value pair in a given map
  err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
  using attr->map_fd, attr->key, attr->value
  returns zero or negative error
- find and delete element by key in a given map
  err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)
  using attr->map_fd, attr->key
- iterate map elements (based on input key return next_key)
  err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size)
  using attr->map_fd, attr->key, attr->next_key
- close(fd) deletes the map
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
											
										 
											2014-09-26 00:16:59 -07:00
										 |  |  | /* if error is returned, fd is released.
 | 
					
						
							|  |  |  |  * On success caller should complete fd access with matching fdput() | 
					
						
							|  |  |  |  */ | 
					
						
							|  |  |  | struct bpf_map *bpf_map_get(struct fd f) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	struct bpf_map *map; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	if (!f.file) | 
					
						
							|  |  |  | 		return ERR_PTR(-EBADF); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	if (f.file->f_op != &bpf_map_fops) { | 
					
						
							|  |  |  | 		fdput(f); | 
					
						
							|  |  |  | 		return ERR_PTR(-EINVAL); | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	map = f.file->private_data; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	return map; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /* helper to convert user pointers passed inside __aligned_u64 fields */ | 
					
						
							|  |  |  | static void __user *u64_to_ptr(__u64 val) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	return (void __user *) (unsigned long) val; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /* last field in 'union bpf_attr' used by this command */ | 
					
						
							|  |  |  | #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static int map_lookup_elem(union bpf_attr *attr) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	void __user *ukey = u64_to_ptr(attr->key); | 
					
						
							|  |  |  | 	void __user *uvalue = u64_to_ptr(attr->value); | 
					
						
							|  |  |  | 	int ufd = attr->map_fd; | 
					
						
							|  |  |  | 	struct fd f = fdget(ufd); | 
					
						
							|  |  |  | 	struct bpf_map *map; | 
					
						
							|  |  |  | 	void *key, *value; | 
					
						
							|  |  |  | 	int err; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) | 
					
						
							|  |  |  | 		return -EINVAL; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	map = bpf_map_get(f); | 
					
						
							|  |  |  | 	if (IS_ERR(map)) | 
					
						
							|  |  |  | 		return PTR_ERR(map); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	err = -ENOMEM; | 
					
						
							|  |  |  | 	key = kmalloc(map->key_size, GFP_USER); | 
					
						
							|  |  |  | 	if (!key) | 
					
						
							|  |  |  | 		goto err_put; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	err = -EFAULT; | 
					
						
							|  |  |  | 	if (copy_from_user(key, ukey, map->key_size) != 0) | 
					
						
							|  |  |  | 		goto free_key; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	err = -ESRCH; | 
					
						
							|  |  |  | 	rcu_read_lock(); | 
					
						
							|  |  |  | 	value = map->ops->map_lookup_elem(map, key); | 
					
						
							|  |  |  | 	if (!value) | 
					
						
							|  |  |  | 		goto err_unlock; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	err = -EFAULT; | 
					
						
							|  |  |  | 	if (copy_to_user(uvalue, value, map->value_size) != 0) | 
					
						
							|  |  |  | 		goto err_unlock; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	err = 0; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | err_unlock: | 
					
						
							|  |  |  | 	rcu_read_unlock(); | 
					
						
							|  |  |  | free_key: | 
					
						
							|  |  |  | 	kfree(key); | 
					
						
							|  |  |  | err_put: | 
					
						
							|  |  |  | 	fdput(f); | 
					
						
							|  |  |  | 	return err; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #define BPF_MAP_UPDATE_ELEM_LAST_FIELD value
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static int map_update_elem(union bpf_attr *attr) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	void __user *ukey = u64_to_ptr(attr->key); | 
					
						
							|  |  |  | 	void __user *uvalue = u64_to_ptr(attr->value); | 
					
						
							|  |  |  | 	int ufd = attr->map_fd; | 
					
						
							|  |  |  | 	struct fd f = fdget(ufd); | 
					
						
							|  |  |  | 	struct bpf_map *map; | 
					
						
							|  |  |  | 	void *key, *value; | 
					
						
							|  |  |  | 	int err; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM)) | 
					
						
							|  |  |  | 		return -EINVAL; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	map = bpf_map_get(f); | 
					
						
							|  |  |  | 	if (IS_ERR(map)) | 
					
						
							|  |  |  | 		return PTR_ERR(map); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	err = -ENOMEM; | 
					
						
							|  |  |  | 	key = kmalloc(map->key_size, GFP_USER); | 
					
						
							|  |  |  | 	if (!key) | 
					
						
							|  |  |  | 		goto err_put; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	err = -EFAULT; | 
					
						
							|  |  |  | 	if (copy_from_user(key, ukey, map->key_size) != 0) | 
					
						
							|  |  |  | 		goto free_key; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	err = -ENOMEM; | 
					
						
							|  |  |  | 	value = kmalloc(map->value_size, GFP_USER); | 
					
						
							|  |  |  | 	if (!value) | 
					
						
							|  |  |  | 		goto free_key; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	err = -EFAULT; | 
					
						
							|  |  |  | 	if (copy_from_user(value, uvalue, map->value_size) != 0) | 
					
						
							|  |  |  | 		goto free_value; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	/* eBPF program that use maps are running under rcu_read_lock(),
 | 
					
						
							|  |  |  | 	 * therefore all map accessors rely on this fact, so do the same here | 
					
						
							|  |  |  | 	 */ | 
					
						
							|  |  |  | 	rcu_read_lock(); | 
					
						
							|  |  |  | 	err = map->ops->map_update_elem(map, key, value); | 
					
						
							|  |  |  | 	rcu_read_unlock(); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | free_value: | 
					
						
							|  |  |  | 	kfree(value); | 
					
						
							|  |  |  | free_key: | 
					
						
							|  |  |  | 	kfree(key); | 
					
						
							|  |  |  | err_put: | 
					
						
							|  |  |  | 	fdput(f); | 
					
						
							|  |  |  | 	return err; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #define BPF_MAP_DELETE_ELEM_LAST_FIELD key
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static int map_delete_elem(union bpf_attr *attr) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	void __user *ukey = u64_to_ptr(attr->key); | 
					
						
							|  |  |  | 	int ufd = attr->map_fd; | 
					
						
							|  |  |  | 	struct fd f = fdget(ufd); | 
					
						
							|  |  |  | 	struct bpf_map *map; | 
					
						
							|  |  |  | 	void *key; | 
					
						
							|  |  |  | 	int err; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	if (CHECK_ATTR(BPF_MAP_DELETE_ELEM)) | 
					
						
							|  |  |  | 		return -EINVAL; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	map = bpf_map_get(f); | 
					
						
							|  |  |  | 	if (IS_ERR(map)) | 
					
						
							|  |  |  | 		return PTR_ERR(map); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	err = -ENOMEM; | 
					
						
							|  |  |  | 	key = kmalloc(map->key_size, GFP_USER); | 
					
						
							|  |  |  | 	if (!key) | 
					
						
							|  |  |  | 		goto err_put; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	err = -EFAULT; | 
					
						
							|  |  |  | 	if (copy_from_user(key, ukey, map->key_size) != 0) | 
					
						
							|  |  |  | 		goto free_key; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	rcu_read_lock(); | 
					
						
							|  |  |  | 	err = map->ops->map_delete_elem(map, key); | 
					
						
							|  |  |  | 	rcu_read_unlock(); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | free_key: | 
					
						
							|  |  |  | 	kfree(key); | 
					
						
							|  |  |  | err_put: | 
					
						
							|  |  |  | 	fdput(f); | 
					
						
							|  |  |  | 	return err; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /* last field in 'union bpf_attr' used by this command */ | 
					
						
							|  |  |  | #define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static int map_get_next_key(union bpf_attr *attr) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	void __user *ukey = u64_to_ptr(attr->key); | 
					
						
							|  |  |  | 	void __user *unext_key = u64_to_ptr(attr->next_key); | 
					
						
							|  |  |  | 	int ufd = attr->map_fd; | 
					
						
							|  |  |  | 	struct fd f = fdget(ufd); | 
					
						
							|  |  |  | 	struct bpf_map *map; | 
					
						
							|  |  |  | 	void *key, *next_key; | 
					
						
							|  |  |  | 	int err; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY)) | 
					
						
							|  |  |  | 		return -EINVAL; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	map = bpf_map_get(f); | 
					
						
							|  |  |  | 	if (IS_ERR(map)) | 
					
						
							|  |  |  | 		return PTR_ERR(map); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	err = -ENOMEM; | 
					
						
							|  |  |  | 	key = kmalloc(map->key_size, GFP_USER); | 
					
						
							|  |  |  | 	if (!key) | 
					
						
							|  |  |  | 		goto err_put; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	err = -EFAULT; | 
					
						
							|  |  |  | 	if (copy_from_user(key, ukey, map->key_size) != 0) | 
					
						
							|  |  |  | 		goto free_key; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	err = -ENOMEM; | 
					
						
							|  |  |  | 	next_key = kmalloc(map->key_size, GFP_USER); | 
					
						
							|  |  |  | 	if (!next_key) | 
					
						
							|  |  |  | 		goto free_key; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	rcu_read_lock(); | 
					
						
							|  |  |  | 	err = map->ops->map_get_next_key(map, key, next_key); | 
					
						
							|  |  |  | 	rcu_read_unlock(); | 
					
						
							|  |  |  | 	if (err) | 
					
						
							|  |  |  | 		goto free_next_key; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	err = -EFAULT; | 
					
						
							|  |  |  | 	if (copy_to_user(unext_key, next_key, map->key_size) != 0) | 
					
						
							|  |  |  | 		goto free_next_key; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	err = 0; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | free_next_key: | 
					
						
							|  |  |  | 	kfree(next_key); | 
					
						
							|  |  |  | free_key: | 
					
						
							|  |  |  | 	kfree(key); | 
					
						
							|  |  |  | err_put: | 
					
						
							|  |  |  | 	fdput(f); | 
					
						
							|  |  |  | 	return err; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-26 00:17:00 -07:00
										 |  |  | static LIST_HEAD(bpf_prog_types); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	struct bpf_prog_type_list *tl; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	list_for_each_entry(tl, &bpf_prog_types, list_node) { | 
					
						
							|  |  |  | 		if (tl->type == type) { | 
					
						
							|  |  |  | 			prog->aux->ops = tl->ops; | 
					
						
							|  |  |  | 			prog->aux->prog_type = type; | 
					
						
							|  |  |  | 			return 0; | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	return -EINVAL; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | void bpf_register_prog_type(struct bpf_prog_type_list *tl) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	list_add(&tl->list_node, &bpf_prog_types); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-26 00:17:01 -07:00
										 |  |  | /* fixup insn->imm field of bpf_call instructions:
 | 
					
						
							|  |  |  |  * if (insn->imm == BPF_FUNC_map_lookup_elem) | 
					
						
							|  |  |  |  *      insn->imm = bpf_map_lookup_elem - __bpf_call_base; | 
					
						
							|  |  |  |  * else if (insn->imm == BPF_FUNC_map_update_elem) | 
					
						
							|  |  |  |  *      insn->imm = bpf_map_update_elem - __bpf_call_base; | 
					
						
							|  |  |  |  * else ... | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  * this function is called after eBPF program passed verification | 
					
						
							|  |  |  |  */ | 
					
						
							|  |  |  | static void fixup_bpf_calls(struct bpf_prog *prog) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	const struct bpf_func_proto *fn; | 
					
						
							|  |  |  | 	int i; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	for (i = 0; i < prog->len; i++) { | 
					
						
							|  |  |  | 		struct bpf_insn *insn = &prog->insnsi[i]; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 		if (insn->code == (BPF_JMP | BPF_CALL)) { | 
					
						
							|  |  |  | 			/* we reach here when program has bpf_call instructions
 | 
					
						
							|  |  |  | 			 * and it passed bpf_check(), means that | 
					
						
							|  |  |  | 			 * ops->get_func_proto must have been supplied, check it | 
					
						
							|  |  |  | 			 */ | 
					
						
							|  |  |  | 			BUG_ON(!prog->aux->ops->get_func_proto); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 			fn = prog->aux->ops->get_func_proto(insn->imm); | 
					
						
							|  |  |  | 			/* all functions that have prototype and verifier allowed
 | 
					
						
							|  |  |  | 			 * programs to call them, must be real in-kernel functions | 
					
						
							|  |  |  | 			 */ | 
					
						
							|  |  |  | 			BUG_ON(!fn->func); | 
					
						
							|  |  |  | 			insn->imm = fn->func - __bpf_call_base; | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-26 00:17:00 -07:00
										 |  |  | /* drop refcnt on maps used by eBPF program and free auxiliary data */ | 
					
						
							|  |  |  | static void free_used_maps(struct bpf_prog_aux *aux) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	int i; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	for (i = 0; i < aux->used_map_cnt; i++) | 
					
						
							|  |  |  | 		bpf_map_put(aux->used_maps[i]); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	kfree(aux->used_maps); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | void bpf_prog_put(struct bpf_prog *prog) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	if (atomic_dec_and_test(&prog->aux->refcnt)) { | 
					
						
							|  |  |  | 		free_used_maps(prog->aux); | 
					
						
							|  |  |  | 		bpf_prog_free(prog); | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static int bpf_prog_release(struct inode *inode, struct file *filp) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	struct bpf_prog *prog = filp->private_data; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	bpf_prog_put(prog); | 
					
						
							|  |  |  | 	return 0; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static const struct file_operations bpf_prog_fops = { | 
					
						
							|  |  |  |         .release = bpf_prog_release, | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static struct bpf_prog *get_prog(struct fd f) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	struct bpf_prog *prog; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	if (!f.file) | 
					
						
							|  |  |  | 		return ERR_PTR(-EBADF); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	if (f.file->f_op != &bpf_prog_fops) { | 
					
						
							|  |  |  | 		fdput(f); | 
					
						
							|  |  |  | 		return ERR_PTR(-EINVAL); | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	prog = f.file->private_data; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	return prog; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /* called by sockets/tracing/seccomp before attaching program to an event
 | 
					
						
							|  |  |  |  * pairs with bpf_prog_put() | 
					
						
							|  |  |  |  */ | 
					
						
							|  |  |  | struct bpf_prog *bpf_prog_get(u32 ufd) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	struct fd f = fdget(ufd); | 
					
						
							|  |  |  | 	struct bpf_prog *prog; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	prog = get_prog(f); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	if (IS_ERR(prog)) | 
					
						
							|  |  |  | 		return prog; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	atomic_inc(&prog->aux->refcnt); | 
					
						
							|  |  |  | 	fdput(f); | 
					
						
							|  |  |  | 	return prog; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /* last field in 'union bpf_attr' used by this command */ | 
					
						
							| 
									
										
											  
											
												bpf: verifier (add ability to receive verification log)
add optional attributes for BPF_PROG_LOAD syscall:
union bpf_attr {
    struct {
	...
	__u32         log_level; /* verbosity level of eBPF verifier */
	__u32         log_size;  /* size of user buffer */
	__aligned_u64 log_buf;   /* user supplied 'char *buffer' */
    };
};
when log_level > 0 the verifier will return its verification log in the user
supplied buffer 'log_buf' which can be used by program author to analyze why
verifier rejected given program.
'Understanding eBPF verifier messages' section of Documentation/networking/filter.txt
provides several examples of these messages, like the program:
  BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
  BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
  BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
  BPF_LD_MAP_FD(BPF_REG_1, 0),
  BPF_CALL_FUNC(BPF_FUNC_map_lookup_elem),
  BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
  BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0),
  BPF_EXIT_INSN(),
will be rejected with the following multi-line message in log_buf:
  0: (7a) *(u64 *)(r10 -8) = 0
  1: (bf) r2 = r10
  2: (07) r2 += -8
  3: (b7) r1 = 0
  4: (85) call 1
  5: (15) if r0 == 0x0 goto pc+1
   R0=map_ptr R10=fp
  6: (7a) *(u64 *)(r0 +4) = 0
  misaligned access off 4 size 8
The format of the output can change at any time as verifier evolves.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
											
										 
											2014-09-26 00:17:03 -07:00
										 |  |  | #define	BPF_PROG_LOAD_LAST_FIELD log_buf
 | 
					
						
							| 
									
										
										
										
											2014-09-26 00:17:00 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | static int bpf_prog_load(union bpf_attr *attr) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	enum bpf_prog_type type = attr->prog_type; | 
					
						
							|  |  |  | 	struct bpf_prog *prog; | 
					
						
							|  |  |  | 	int err; | 
					
						
							|  |  |  | 	char license[128]; | 
					
						
							|  |  |  | 	bool is_gpl; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	if (CHECK_ATTR(BPF_PROG_LOAD)) | 
					
						
							|  |  |  | 		return -EINVAL; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	/* copy eBPF program license from user space */ | 
					
						
							|  |  |  | 	if (strncpy_from_user(license, u64_to_ptr(attr->license), | 
					
						
							|  |  |  | 			      sizeof(license) - 1) < 0) | 
					
						
							|  |  |  | 		return -EFAULT; | 
					
						
							|  |  |  | 	license[sizeof(license) - 1] = 0; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	/* eBPF programs must be GPL compatible to use GPL-ed functions */ | 
					
						
							|  |  |  | 	is_gpl = license_is_gpl_compatible(license); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	if (attr->insn_cnt >= BPF_MAXINSNS) | 
					
						
							|  |  |  | 		return -EINVAL; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	/* plain bpf_prog allocation */ | 
					
						
							|  |  |  | 	prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); | 
					
						
							|  |  |  | 	if (!prog) | 
					
						
							|  |  |  | 		return -ENOMEM; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	prog->len = attr->insn_cnt; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	err = -EFAULT; | 
					
						
							|  |  |  | 	if (copy_from_user(prog->insns, u64_to_ptr(attr->insns), | 
					
						
							|  |  |  | 			   prog->len * sizeof(struct bpf_insn)) != 0) | 
					
						
							|  |  |  | 		goto free_prog; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	prog->orig_prog = NULL; | 
					
						
							|  |  |  | 	prog->jited = false; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	atomic_set(&prog->aux->refcnt, 1); | 
					
						
							|  |  |  | 	prog->aux->is_gpl_compatible = is_gpl; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	/* find program type: socket_filter vs tracing_filter */ | 
					
						
							|  |  |  | 	err = find_prog_type(type, prog); | 
					
						
							|  |  |  | 	if (err < 0) | 
					
						
							|  |  |  | 		goto free_prog; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	/* run eBPF verifier */ | 
					
						
							| 
									
										
										
										
											2014-09-26 00:17:02 -07:00
										 |  |  | 	err = bpf_check(prog, attr); | 
					
						
							| 
									
										
										
										
											2014-09-26 00:17:00 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | 	if (err < 0) | 
					
						
							|  |  |  | 		goto free_used_maps; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-26 00:17:01 -07:00
										 |  |  | 	/* fixup BPF_CALL->imm field */ | 
					
						
							|  |  |  | 	fixup_bpf_calls(prog); | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-26 00:17:00 -07:00
										 |  |  | 	/* eBPF program is ready to be JITed */ | 
					
						
							|  |  |  | 	bpf_prog_select_runtime(prog); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	if (err < 0) | 
					
						
							|  |  |  | 		/* failed to allocate fd */ | 
					
						
							|  |  |  | 		goto free_used_maps; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	return err; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | free_used_maps: | 
					
						
							|  |  |  | 	free_used_maps(prog->aux); | 
					
						
							|  |  |  | free_prog: | 
					
						
							|  |  |  | 	bpf_prog_free(prog); | 
					
						
							|  |  |  | 	return err; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-26 00:16:57 -07:00
										 |  |  | SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	union bpf_attr attr = {}; | 
					
						
							|  |  |  | 	int err; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	/* the syscall is limited to root temporarily. This restriction will be
 | 
					
						
							|  |  |  | 	 * lifted when security audit is clean. Note that eBPF+tracing must have | 
					
						
							|  |  |  | 	 * this restriction, since it may pass kernel data to user space | 
					
						
							|  |  |  | 	 */ | 
					
						
							|  |  |  | 	if (!capable(CAP_SYS_ADMIN)) | 
					
						
							|  |  |  | 		return -EPERM; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	if (!access_ok(VERIFY_READ, uattr, 1)) | 
					
						
							|  |  |  | 		return -EFAULT; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	if (size > PAGE_SIZE)	/* silly large */ | 
					
						
							|  |  |  | 		return -E2BIG; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	/* If we're handed a bigger struct than we know of,
 | 
					
						
							|  |  |  | 	 * ensure all the unknown bits are 0 - i.e. new | 
					
						
							|  |  |  | 	 * user-space does not rely on any kernel feature | 
					
						
							|  |  |  | 	 * extensions we dont know about yet. | 
					
						
							|  |  |  | 	 */ | 
					
						
							|  |  |  | 	if (size > sizeof(attr)) { | 
					
						
							|  |  |  | 		unsigned char __user *addr; | 
					
						
							|  |  |  | 		unsigned char __user *end; | 
					
						
							|  |  |  | 		unsigned char val; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 		addr = (void __user *)uattr + sizeof(attr); | 
					
						
							|  |  |  | 		end  = (void __user *)uattr + size; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 		for (; addr < end; addr++) { | 
					
						
							|  |  |  | 			err = get_user(val, addr); | 
					
						
							|  |  |  | 			if (err) | 
					
						
							|  |  |  | 				return err; | 
					
						
							|  |  |  | 			if (val) | 
					
						
							|  |  |  | 				return -E2BIG; | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 		size = sizeof(attr); | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	/* copy attributes from user space, may be less than sizeof(bpf_attr) */ | 
					
						
							|  |  |  | 	if (copy_from_user(&attr, uattr, size) != 0) | 
					
						
							|  |  |  | 		return -EFAULT; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	switch (cmd) { | 
					
						
							|  |  |  | 	case BPF_MAP_CREATE: | 
					
						
							|  |  |  | 		err = map_create(&attr); | 
					
						
							|  |  |  | 		break; | 
					
						
							| 
									
										
											  
											
												bpf: add lookup/update/delete/iterate methods to BPF maps
'maps' is a generic storage of different types for sharing data between kernel
and userspace.
The maps are accessed from user space via BPF syscall, which has commands:
- create a map with given type and attributes
  fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size)
  returns fd or negative error
- lookup key in a given map referenced by fd
  err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)
  using attr->map_fd, attr->key, attr->value
  returns zero and stores found elem into value or negative error
- create or update key/value pair in a given map
  err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
  using attr->map_fd, attr->key, attr->value
  returns zero or negative error
- find and delete element by key in a given map
  err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)
  using attr->map_fd, attr->key
- iterate map elements (based on input key return next_key)
  err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size)
  using attr->map_fd, attr->key, attr->next_key
- close(fd) deletes the map
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
											
										 
											2014-09-26 00:16:59 -07:00
										 |  |  | 	case BPF_MAP_LOOKUP_ELEM: | 
					
						
							|  |  |  | 		err = map_lookup_elem(&attr); | 
					
						
							|  |  |  | 		break; | 
					
						
							|  |  |  | 	case BPF_MAP_UPDATE_ELEM: | 
					
						
							|  |  |  | 		err = map_update_elem(&attr); | 
					
						
							|  |  |  | 		break; | 
					
						
							|  |  |  | 	case BPF_MAP_DELETE_ELEM: | 
					
						
							|  |  |  | 		err = map_delete_elem(&attr); | 
					
						
							|  |  |  | 		break; | 
					
						
							|  |  |  | 	case BPF_MAP_GET_NEXT_KEY: | 
					
						
							|  |  |  | 		err = map_get_next_key(&attr); | 
					
						
							|  |  |  | 		break; | 
					
						
							| 
									
										
										
										
											2014-09-26 00:17:00 -07:00
										 |  |  | 	case BPF_PROG_LOAD: | 
					
						
							|  |  |  | 		err = bpf_prog_load(&attr); | 
					
						
							|  |  |  | 		break; | 
					
						
							| 
									
										
										
										
											2014-09-26 00:16:57 -07:00
										 |  |  | 	default: | 
					
						
							|  |  |  | 		err = -EINVAL; | 
					
						
							|  |  |  | 		break; | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	return err; | 
					
						
							|  |  |  | } |