| 
									
										
										
										
											2014-09-04 22:17:18 -07:00
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 */
					
						
							|  |  |  | #ifndef _UAPI__LINUX_BPF_H__
 | 
					
						
							|  |  |  | #define _UAPI__LINUX_BPF_H__
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #include <linux/types.h>
 | 
					
						
							| 
									
										
										
										
											2014-10-14 02:08:54 -07:00
										 |  |  | #include <linux/bpf_common.h>
 | 
					
						
							| 
									
										
										
										
											2014-09-04 22:17:18 -07:00
										 |  |  | 
 | 
					
						
							/* Extended instruction set based on top of classic BPF */

/*
 * Only the eBPF additions are defined here. Each group below extends a
 * different field of the opcode, so the same numeric value may appear in
 * more than one group (BPF_XADD and BPF_ARSH are both 0xc0).
 */

/* instruction classes */
#define BPF_ALU64	0x07	/* alu mode in double word width */

/* ld/ldx fields */
#define BPF_DW		0x18	/* double word */
#define BPF_XADD	0xc0	/* exclusive add */

/* alu/jmp fields */
#define BPF_MOV		0xb0	/* mov reg to reg */
#define BPF_ARSH	0xc0	/* sign extending arithmetic shift right */

/* change endianness of a register */
#define BPF_END		0xd0	/* flags for endianness conversion: */
#define BPF_TO_LE	0x00	/* convert to little-endian */
#define BPF_TO_BE	0x08	/* convert to big-endian */
/* The FROM_* names are aliases of TO_*: the same flag value is used. */
#define BPF_FROM_LE	BPF_TO_LE
#define BPF_FROM_BE	BPF_TO_BE

#define BPF_JNE		0x50	/* jump != */
#define BPF_JSGT	0x60	/* SGT is signed '>', GT in x86 */
#define BPF_JSGE	0x70	/* SGE is signed '>=', GE in x86 */
#define BPF_CALL	0x80	/* function call */
#define BPF_EXIT	0x90	/* function return */
					
						
							|  |  |  | 
 | 
					
						
							/* Register numbers */
enum {
	BPF_REG_0 = 0,
	BPF_REG_1,
	BPF_REG_2,
	BPF_REG_3,
	BPF_REG_4,
	BPF_REG_5,
	BPF_REG_6,
	BPF_REG_7,
	BPF_REG_8,
	BPF_REG_9,
	BPF_REG_10,
	/* Not a register: one past BPF_REG_10, i.e. the number of entries (11). */
	__MAX_BPF_REG,
};

/* BPF has 10 general purpose 64-bit registers and stack frame. */
#define MAX_BPF_REG	__MAX_BPF_REG
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | struct bpf_insn { | 
					
						
							|  |  |  | 	__u8	code;		/* opcode */ | 
					
						
							|  |  |  | 	__u8	dst_reg:4;	/* dest register */ | 
					
						
							|  |  |  | 	__u8	src_reg:4;	/* source register */ | 
					
						
							|  |  |  | 	__s16	off;		/* signed offset */ | 
					
						
							|  |  |  | 	__s32	imm;		/* signed immediate constant */ | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-26 00:16:57 -07:00
										 /* BPF syscall commands */
/*
 * Passed as the first argument of bpf(cmd, union bpf_attr *attr, u32 size).
 * Enumerators are numbered by position starting at 0, so reordering them
 * changes the values callers pass.
 */
enum bpf_cmd {
	/* create a map with given type and attributes
	 * fd = bpf(BPF_MAP_CREATE, union bpf_attr *, u32 size)
	 * returns fd or negative error
	 * map is deleted when fd is closed
	 */
	BPF_MAP_CREATE,

	/* lookup key in a given map
	 * err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)
	 * Using attr->map_fd, attr->key, attr->value
	 * returns zero and stores found elem into value
	 * or negative error
	 */
	BPF_MAP_LOOKUP_ELEM,

	/* create or update key/value pair in a given map
	 * err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
	 * Using attr->map_fd, attr->key, attr->value, attr->flags
	 * attr->flags is one of BPF_ANY, BPF_NOEXIST, BPF_EXIST
	 * returns zero or negative error
	 */
	BPF_MAP_UPDATE_ELEM,

	/* find and delete elem by key in a given map
	 * err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)
	 * Using attr->map_fd, attr->key
	 * returns zero or negative error
	 */
	BPF_MAP_DELETE_ELEM,

	/* lookup key in a given map and return next key
	 * err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size)
	 * Using attr->map_fd, attr->key, attr->next_key
	 * returns zero and stores next key or negative error
	 */
	BPF_MAP_GET_NEXT_KEY,

	/* verify and load eBPF program
	 * prog_fd = bpf(BPF_PROG_LOAD, union bpf_attr *attr, u32 size)
	 * Using attr->prog_type, attr->insns, attr->license
	 * returns fd or negative error
	 */
	BPF_PROG_LOAD,
};
					
						
							|  |  |  | 
 | 
					
						
							enum bpf_map_type {
	/* Value for the map_type field of the BPF_MAP_CREATE attributes. */
	BPF_MAP_TYPE_UNSPEC,
	BPF_MAP_TYPE_HASH,
	BPF_MAP_TYPE_ARRAY,
};
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-26 00:17:00 -07:00
										 enum bpf_prog_type {
	/* Value for the prog_type field of the BPF_PROG_LOAD attributes. */
	BPF_PROG_TYPE_UNSPEC,
	BPF_PROG_TYPE_SOCKET_FILTER,
	/*
	 * Program attached to a kprobe through the perf event fd:
	 *   ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
	 * kprobes are not a stable ABI, so the loader must supply the
	 * matching LINUX_VERSION_CODE in attr.kern_version.
	 */
	BPF_PROG_TYPE_KPROBE,
	/* eBPF traffic classifier (cls_bpf) */
	BPF_PROG_TYPE_SCHED_CLS,
	BPF_PROG_TYPE_SCHED_ACT,
};
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-03-01 12:31:43 +01:00
										 /* NOTE(review): meaning is not shown in this header; presumably marks an instruction operand as a map fd -- confirm against the users of this macro. */
#define BPF_PSEUDO_MAP_FD	1
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												bpf: add 'flags' attribute to BPF_MAP_UPDATE_ELEM command
the current meaning of BPF_MAP_UPDATE_ELEM syscall command is:
either update existing map element or create a new one.
Initially the plan was to add a new command to handle the case of
'create new element if it didn't exist', but 'flags' style looks
cleaner and overall diff is much smaller (more code reused), so add 'flags'
attribute to BPF_MAP_UPDATE_ELEM command with the following meaning:
 #define BPF_ANY	0 /* create new element or update existing */
 #define BPF_NOEXIST	1 /* create new element if it didn't exist */
 #define BPF_EXIST	2 /* update existing element */
bpf_update_elem(fd, key, value, BPF_NOEXIST) call can fail with EEXIST
if element already exists.
bpf_update_elem(fd, key, value, BPF_EXIST) can fail with ENOENT
if element doesn't exist.
Userspace will call it as:
int bpf_update_elem(int fd, void *key, void *value, __u64 flags)
{
    union bpf_attr attr = {
        .map_fd = fd,
        .key = ptr_to_u64(key),
        .value = ptr_to_u64(value),
        .flags = flags,
    };
    return bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
}
First two bits of 'flags' are used to encode style of bpf_update_elem() command.
Bits 2-63 are reserved for future use.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
											
										 
											2014-11-13 17:36:44 -08:00
										 /* flags for BPF_MAP_UPDATE_ELEM command */
/*
 * The first two bits of 'flags' select the update style; bits 2-63 are
 * reserved for future use.
 * BPF_NOEXIST can fail with EEXIST if the element already exists.
 * BPF_EXIST can fail with ENOENT if the element doesn't exist.
 */
#define BPF_ANY		0 /* create new element or update existing */
#define BPF_NOEXIST	1 /* create new element if it didn't exist */
#define BPF_EXIST	2 /* update existing element */
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-26 00:16:57 -07:00
										 |  |  | union bpf_attr { | 
					
						
							|  |  |  | 	struct { /* anonymous struct used by BPF_MAP_CREATE command */ | 
					
						
							|  |  |  | 		__u32	map_type;	/* one of enum bpf_map_type */ | 
					
						
							|  |  |  | 		__u32	key_size;	/* size of key in bytes */ | 
					
						
							|  |  |  | 		__u32	value_size;	/* size of value in bytes */ | 
					
						
							|  |  |  | 		__u32	max_entries;	/* max number of entries in a map */ | 
					
						
							|  |  |  | 	}; | 
					
						
							| 
									
										
											  
											
												bpf: add lookup/update/delete/iterate methods to BPF maps
'maps' is a generic storage of different types for sharing data between kernel
and userspace.
The maps are accessed from user space via BPF syscall, which has commands:
- create a map with given type and attributes
  fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size)
  returns fd or negative error
- lookup key in a given map referenced by fd
  err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)
  using attr->map_fd, attr->key, attr->value
  returns zero and stores found elem into value or negative error
- create or update key/value pair in a given map
  err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
  using attr->map_fd, attr->key, attr->value
  returns zero or negative error
- find and delete element by key in a given map
  err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)
  using attr->map_fd, attr->key
- iterate map elements (based on input key return next_key)
  err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size)
  using attr->map_fd, attr->key, attr->next_key
- close(fd) deletes the map
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
											
										 
											2014-09-26 00:16:59 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ | 
					
						
							|  |  |  | 		__u32		map_fd; | 
					
						
							|  |  |  | 		__aligned_u64	key; | 
					
						
							|  |  |  | 		union { | 
					
						
							|  |  |  | 			__aligned_u64 value; | 
					
						
							|  |  |  | 			__aligned_u64 next_key; | 
					
						
							|  |  |  | 		}; | 
					
						
							| 
									
										
											  
											
												bpf: add 'flags' attribute to BPF_MAP_UPDATE_ELEM command
the current meaning of BPF_MAP_UPDATE_ELEM syscall command is:
either update existing map element or create a new one.
Initially the plan was to add a new command to handle the case of
'create new element if it didn't exist', but 'flags' style looks
cleaner and overall diff is much smaller (more code reused), so add 'flags'
attribute to BPF_MAP_UPDATE_ELEM command with the following meaning:
 #define BPF_ANY	0 /* create new element or update existing */
 #define BPF_NOEXIST	1 /* create new element if it didn't exist */
 #define BPF_EXIST	2 /* update existing element */
bpf_update_elem(fd, key, value, BPF_NOEXIST) call can fail with EEXIST
if element already exists.
bpf_update_elem(fd, key, value, BPF_EXIST) can fail with ENOENT
if element doesn't exist.
Userspace will call it as:
int bpf_update_elem(int fd, void *key, void *value, __u64 flags)
{
    union bpf_attr attr = {
        .map_fd = fd,
        .key = ptr_to_u64(key),
        .value = ptr_to_u64(value),
        .flags = flags;
    };
    return bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
}
First two bits of 'flags' are used to encode style of bpf_update_elem() command.
Bits 2-63 are reserved for future use.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
											
										 
											2014-11-13 17:36:44 -08:00
										 |  |  | 		__u64		flags; | 
					
						
							| 
									
										
											  
											
												bpf: add lookup/update/delete/iterate methods to BPF maps
'maps' is a generic storage of different types for sharing data between kernel
and userspace.
The maps are accessed from user space via BPF syscall, which has commands:
- create a map with given type and attributes
  fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size)
  returns fd or negative error
- lookup key in a given map referenced by fd
  err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)
  using attr->map_fd, attr->key, attr->value
  returns zero and stores found elem into value or negative error
- create or update key/value pair in a given map
  err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
  using attr->map_fd, attr->key, attr->value
  returns zero or negative error
- find and delete element by key in a given map
  err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)
  using attr->map_fd, attr->key
- iterate map elements (based on input key return next_key)
  err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size)
  using attr->map_fd, attr->key, attr->next_key
- close(fd) deletes the map
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
											
										 
											2014-09-26 00:16:59 -07:00
										 |  |  | 	}; | 
					
						
							| 
									
										
										
										
											2014-09-26 00:17:00 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | 	struct { /* anonymous struct used by BPF_PROG_LOAD command */ | 
					
						
							|  |  |  | 		__u32		prog_type;	/* one of enum bpf_prog_type */ | 
					
						
							|  |  |  | 		__u32		insn_cnt; | 
					
						
							|  |  |  | 		__aligned_u64	insns; | 
					
						
							|  |  |  | 		__aligned_u64	license; | 
					
						
							| 
									
										
											  
											
												bpf: verifier (add ability to receive verification log)
add optional attributes for BPF_PROG_LOAD syscall:
union bpf_attr {
    struct {
	...
	__u32         log_level; /* verbosity level of eBPF verifier */
	__u32         log_size;  /* size of user buffer */
	__aligned_u64 log_buf;   /* user supplied 'char *buffer' */
    };
};
when log_level > 0 the verifier will return its verification log in the user
supplied buffer 'log_buf' which can be used by program author to analyze why
verifier rejected given program.
'Understanding eBPF verifier messages' section of Documentation/networking/filter.txt
provides several examples of these messages, like the program:
  BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
  BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
  BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
  BPF_LD_MAP_FD(BPF_REG_1, 0),
  BPF_CALL_FUNC(BPF_FUNC_map_lookup_elem),
  BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
  BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0),
  BPF_EXIT_INSN(),
will be rejected with the following multi-line message in log_buf:
  0: (7a) *(u64 *)(r10 -8) = 0
  1: (bf) r2 = r10
  2: (07) r2 += -8
  3: (b7) r1 = 0
  4: (85) call 1
  5: (15) if r0 == 0x0 goto pc+1
   R0=map_ptr R10=fp
  6: (7a) *(u64 *)(r0 +4) = 0
  misaligned access off 4 size 8
The format of the output can change at any time as verifier evolves.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
											
										 
											2014-09-26 00:17:03 -07:00
										 |  |  | 		__u32		log_level;	/* verbosity level of verifier */ | 
					
						
							|  |  |  | 		__u32		log_size;	/* size of user buffer */ | 
					
						
							|  |  |  | 		__aligned_u64	log_buf;	/* user supplied buffer */ | 
					
						
							| 
									
										
											  
											
												tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
	struct perf_event_attr attr = {
		.type	= PERF_TYPE_TRACEPOINT,
		.config	= event_id,
		...
	};
	event_fd = perf_event_open(&attr,...);
	ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
	close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
  - lookup/update/delete elements in maps
  - probe_read - wraper of probe_kernel_read() used to access any
    kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are a fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
											
										 
											2015-03-25 12:49:20 -07:00
										 |  |  | 		__u32		kern_version;	/* checked when prog_type=kprobe */ | 
					
						
							| 
									
										
										
										
											2014-09-26 00:17:00 -07:00
										 |  |  | 	}; | 
					
						
							| 
									
										
										
										
											2014-09-26 00:16:57 -07:00
										 |  |  | } __attribute__((aligned(8))); | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-26 00:17:00 -07:00
										 |  |  | /* integer value in 'imm' field of BPF_CALL instruction selects which helper
 | 
					
						
							|  |  |  |  * function eBPF program intends to call | 
					
						
							|  |  |  |  */ | 
					
						
							|  |  |  | enum bpf_func_id { | 
					
						
							|  |  |  | 	BPF_FUNC_unspec, | 
					
						
							| 
									
										
										
										
											2014-11-13 17:36:49 -08:00
										 |  |  | 	BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */ | 
					
						
							|  |  |  | 	BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */ | 
					
						
							|  |  |  | 	BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ | 
					
						
							| 
									
										
											  
											
												tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
	struct perf_event_attr attr = {
		.type	= PERF_TYPE_TRACEPOINT,
		.config	= event_id,
		...
	};
	event_fd = perf_event_open(&attr,...);
	ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
	close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
  - lookup/update/delete elements in maps
  - probe_read - wrapper of probe_kernel_read() used to access any
    kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
											
										 
											2015-03-25 12:49:20 -07:00
										 |  |  | 	BPF_FUNC_probe_read,      /* int bpf_probe_read(void *dst, int size, void *src) */ | 
					
						
							| 
									
										
										
										
											2015-03-25 12:49:21 -07:00
										 |  |  | 	BPF_FUNC_ktime_get_ns,    /* u64 bpf_ktime_get_ns(void) */ | 
					
						
							| 
									
										
										
										
											2015-03-25 12:49:22 -07:00
										 |  |  | 	BPF_FUNC_trace_printk,    /* int bpf_trace_printk(const char *fmt, int fmt_size, ...) */ | 
					
						
							| 
									
										
										
										
											2015-03-14 02:27:16 +01:00
										 |  |  | 	BPF_FUNC_get_prandom_u32, /* u32 prandom_u32(void) */ | 
					
						
							| 
									
										
										
										
											2015-03-14 02:27:17 +01:00
										 |  |  | 	BPF_FUNC_get_smp_processor_id, /* u32 raw_smp_processor_id(void) */ | 
					
						
							| 
									
										
										
										
											2015-04-01 17:12:13 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | 	/**
 | 
					
						
							|  |  |  | 	 * skb_store_bytes(skb, offset, from, len, flags) - store bytes into packet | 
					
						
							|  |  |  | 	 * @skb: pointer to skb | 
					
						
							| 
									
										
										
										
											2015-04-15 12:55:45 -07:00
										 |  |  | 	 * @offset: offset within packet from skb->mac_header | 
					
						
							| 
									
										
										
										
											2015-04-01 17:12:13 -07:00
										 |  |  | 	 * @from: pointer where to copy bytes from | 
					
						
							|  |  |  | 	 * @len: number of bytes to store into packet | 
					
						
							|  |  |  | 	 * @flags: bit 0 - if true, recompute skb->csum | 
					
						
							|  |  |  | 	 *         other bits - reserved | 
					
						
							|  |  |  | 	 * Return: 0 on success | 
					
						
							|  |  |  | 	 */ | 
					
						
							|  |  |  | 	BPF_FUNC_skb_store_bytes, | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	/**
 | 
					
						
							|  |  |  | 	 * l3_csum_replace(skb, offset, from, to, flags) - recompute IP checksum | 
					
						
							|  |  |  | 	 * @skb: pointer to skb | 
					
						
							|  |  |  | 	 * @offset: offset within packet where IP checksum is located | 
					
						
							|  |  |  | 	 * @from: old value of header field | 
					
						
							|  |  |  | 	 * @to: new value of header field | 
					
						
							|  |  |  | 	 * @flags: bits 0-3 - size of header field | 
					
						
							|  |  |  | 	 *         other bits - reserved | 
					
						
							|  |  |  | 	 * Return: 0 on success | 
					
						
							|  |  |  | 	 */ | 
					
						
							|  |  |  | 	BPF_FUNC_l3_csum_replace, | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	/**
 | 
					
						
							|  |  |  | 	 * l4_csum_replace(skb, offset, from, to, flags) - recompute TCP/UDP checksum | 
					
						
							|  |  |  | 	 * @skb: pointer to skb | 
					
						
							|  |  |  | 	 * @offset: offset within packet where TCP/UDP checksum is located | 
					
						
							|  |  |  | 	 * @from: old value of header field | 
					
						
							|  |  |  | 	 * @to: new value of header field | 
					
						
							|  |  |  | 	 * @flags: bits 0-3 - size of header field | 
					
						
							|  |  |  | 	 *         bit 4 - is pseudo header | 
					
						
							|  |  |  | 	 *         other bits - reserved | 
					
						
							|  |  |  | 	 * Return: 0 on success | 
					
						
							|  |  |  | 	 */ | 
					
						
							|  |  |  | 	BPF_FUNC_l4_csum_replace, | 
					
						
							| 
									
										
										
										
											2014-09-26 00:17:00 -07:00
										 |  |  | 	__BPF_FUNC_MAX_ID, | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-03-13 11:57:42 -07:00
										 |  |  | /* user accessible mirror of in-kernel sk_buff.
 | 
					
						
							|  |  |  |  * new fields can only be added to the end of this structure | 
					
						
							|  |  |  |  */ | 
					
						
							|  |  |  | struct __sk_buff { | 
					
						
							|  |  |  | 	__u32 len; | 
					
						
							|  |  |  | 	__u32 pkt_type; | 
					
						
							|  |  |  | 	__u32 mark; | 
					
						
							|  |  |  | 	__u32 queue_mapping; | 
					
						
							| 
									
										
										
										
											2015-03-16 18:06:02 -07:00
										 |  |  | 	__u32 protocol; | 
					
						
							|  |  |  | 	__u32 vlan_present; | 
					
						
							|  |  |  | 	__u32 vlan_tci; | 
					
						
							| 
									
										
										
										
											2015-03-24 14:48:41 +01:00
										 |  |  | 	__u32 vlan_proto; | 
					
						
							| 
									
										
										
										
											2015-04-03 20:52:24 +02:00
										 |  |  | 	__u32 priority; | 
					
						
							| 
									
										
										
										
											2015-03-13 11:57:42 -07:00
										 |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-04 22:17:18 -07:00
										 |  |  | #endif /* _UAPI__LINUX_BPF_H__ */
 |