 c2e1f2e30d
			
		
	
	
	c2e1f2e30d
	
	
	
		
			
			Applying restrictive seccomp filter programs to large or diverse codebases often requires handling threads which may be started early in the process lifetime (e.g., by code that is linked in). While it is possible to apply permissive programs prior to process start up, it is difficult to further restrict the kernel ABI to those threads after that point. This change adds a new seccomp syscall flag to SECCOMP_SET_MODE_FILTER for synchronizing thread group seccomp filters at filter installation time. When calling seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC, filter) an attempt will be made to synchronize all threads in current's threadgroup to its new seccomp filter program. This is possible iff all threads are using a filter that is an ancestor to the filter current is attempting to synchronize to. NULL filters (where the task is running as SECCOMP_MODE_DISABLED) are also treated as ancestors, allowing threads to be transitioned into SECCOMP_MODE_FILTER. If prctl(PR_SET_NO_NEW_PRIVS, ...) has been set on the calling thread, no_new_privs will be set for all synchronized threads too. On success, 0 is returned. On failure, the pid of one of the failing threads will be returned and no filters will have been applied. The race conditions against another thread are: - requesting TSYNC (already handled by sighand lock) - performing a clone (already handled by sighand lock) - changing its filter (already handled by sighand lock) - calling exec (handled by cred_guard_mutex) The clone case is assisted by the fact that new threads will have their seccomp state duplicated from their parent before appearing on the tasklist. Holding cred_guard_mutex means that seccomp filters cannot be assigned while in the middle of another thread's exec (potentially bypassing no_new_privs or similar). The call to de_thread() may kill threads waiting for the mutex. Changes across threads to the filter pointer include a barrier. Based on patches by Will Drewry. 
Suggested-by: Julien Tinnes <jln@chromium.org> Signed-off-by: Kees Cook <keescook@chromium.org> Reviewed-by: Oleg Nesterov <oleg@redhat.com> Reviewed-by: Andy Lutomirski <luto@amacapital.net>
		
			
				
	
	
		
			54 lines
		
	
	
	
		
			1.8 KiB
			
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			54 lines
		
	
	
	
		
			1.8 KiB
			
		
	
	
	
		
			C
		
	
	
	
	
	
| #ifndef _UAPI_LINUX_SECCOMP_H
 | |
| #define _UAPI_LINUX_SECCOMP_H
 | |
| 
 | |
| #include <linux/compiler.h>
 | |
| #include <linux/types.h>
 | |
| 
 | |
| 
 | |
| /* Valid values for seccomp.mode and prctl(PR_SET_SECCOMP, <mode>) */
 | |
| #define SECCOMP_MODE_DISABLED	0 /* seccomp is not in use. */
 | |
| #define SECCOMP_MODE_STRICT	1 /* uses hard-coded filter. */
 | |
| #define SECCOMP_MODE_FILTER	2 /* uses user-supplied filter. */
 | |
| 
 | |
| /* Valid operations for seccomp syscall. */
 | |
| #define SECCOMP_SET_MODE_STRICT	0 /* set SECCOMP_MODE_STRICT */
 | |
| #define SECCOMP_SET_MODE_FILTER	1 /* set SECCOMP_MODE_FILTER; accepts the flags below */
 | |
| 
 | |
| /* Valid flags for SECCOMP_SET_MODE_FILTER */
 | |
| #define SECCOMP_FILTER_FLAG_TSYNC	1 /* synchronize the new filter to all threads in the thread group */
 | |
| 
 | |
| /*
 | |
|  * All BPF programs must return a 32-bit value.
 | |
|  * The bottom 16-bits are for optional return data (see SECCOMP_RET_DATA).
 | |
|  * The upper 16-bits are ordered from least permissive values to most.
 | |
|  *
 | |
|  * The ordering ensures that a min_t() over composed return values always
 | |
|  * selects the least permissive choice: SECCOMP_RET_KILL is the smallest
 | |
|  * value defined here and SECCOMP_RET_ALLOW the largest.
 | |
|  */
 | |
| #define SECCOMP_RET_KILL	0x00000000U /* kill the task immediately */
 | |
| #define SECCOMP_RET_TRAP	0x00030000U /* disallow and force a SIGSYS */
 | |
| #define SECCOMP_RET_ERRNO	0x00050000U /* returns an errno */
 | |
| #define SECCOMP_RET_TRACE	0x7ff00000U /* pass to a tracer or disallow */
 | |
| #define SECCOMP_RET_ALLOW	0x7fff0000U /* allow */
 | |
| 
 | |
| /* Masks for the return value sections. */
 | |
| #define SECCOMP_RET_ACTION	0x7fff0000U /* selects the action bits (bits 16-30) */
 | |
| #define SECCOMP_RET_DATA	0x0000ffffU /* selects the bottom 16 bits of return data */
 | |
| 
 | |
| /**
 | |
|  * struct seccomp_data - the format the BPF program executes over.
 | |
|  * @nr: the system call number
 | |
|  * @arch: indicates system call convention as an AUDIT_ARCH_* value
 | |
|  *        as defined in <linux/audit.h>.
 | |
|  * @instruction_pointer: the instruction pointer at the time of the system call.
 | |
|  * @args: up to 6 system call arguments always stored as 64-bit values
 | |
|  *        regardless of the architecture.
 | |
|  *
 | |
|  * Every member except @nr (a plain int) is declared with a fixed-width
 | |
|  * __u32/__u64 type, and @args always has room for six entries.
 | |
|  */
 | |
| struct seccomp_data {
 | |
| 	int nr;
 | |
| 	__u32 arch;
 | |
| 	__u64 instruction_pointer;
 | |
| 	__u64 args[6];
 | |
| };
 | |
| 
 | |
| #endif /* _UAPI_LINUX_SECCOMP_H */
 |