 f431b634f2
			
		
	
	
	f431b634f2
	
	
	
		
			
			The tracing of ia32 compat system calls has been a bit of a pain as they
use different system call numbers than the 64bit equivalents.
I wrote a simple 'lls' program that lists files. I compiled it as an i686
ELF binary and ran it on an x86_64 box. This is the result:
echo 0 > /debug/tracing/tracing_on
echo 1 > /debug/tracing/events/syscalls/enable
echo 1 > /debug/tracing/tracing_on ; ./lls ; echo 0 > /debug/tracing/tracing_on
grep lls /debug/tracing/trace
[.. skipping calls before TS_COMPAT is set ...]
             lls-1127  [005] d...   936.409188: sys_recvfrom(fd: 0, ubuf: 4d560fc4, size: 0, flags: 8048034, addr: 8, addr_len: f7700420)
             lls-1127  [005] d...   936.409190: sys_recvfrom -> 0x8a77000
             lls-1127  [005] d...   936.409211: sys_lgetxattr(pathname: 0, name: 1000, value: 3, size: 22)
             lls-1127  [005] d...   936.409215: sys_lgetxattr -> 0xf76ff000
             lls-1127  [005] d...   936.409223: sys_dup2(oldfd: 4d55ae9b, newfd: 4)
             lls-1127  [005] d...   936.409228: sys_dup2 -> 0xfffffffffffffffe
             lls-1127  [005] d...   936.409236: sys_newfstat(fd: 4d55b085, statbuf: 80000)
             lls-1127  [005] d...   936.409242: sys_newfstat -> 0x3
             lls-1127  [005] d...   936.409243: sys_removexattr(pathname: 3, name: ffcd0060)
             lls-1127  [005] d...   936.409244: sys_removexattr -> 0x0
             lls-1127  [005] d...   936.409245: sys_lgetxattr(pathname: 0, name: 19614, value: 1, size: 2)
             lls-1127  [005] d...   936.409248: sys_lgetxattr -> 0xf76e5000
             lls-1127  [005] d...   936.409248: sys_newlstat(filename: 3, statbuf: 19614)
             lls-1127  [005] d...   936.409249: sys_newlstat -> 0x0
             lls-1127  [005] d...   936.409262: sys_newfstat(fd: f76fb588, statbuf: 80000)
             lls-1127  [005] d...   936.409279: sys_newfstat -> 0x3
             lls-1127  [005] d...   936.409279: sys_close(fd: 3)
             lls-1127  [005] d...   936.421550: sys_close -> 0x200
             lls-1127  [005] d...   936.421558: sys_removexattr(pathname: 3, name: ffcd00d0)
             lls-1127  [005] d...   936.421560: sys_removexattr -> 0x0
             lls-1127  [005] d...   936.421569: sys_lgetxattr(pathname: 4d564000, name: 1b1abc, value: 5, size: 802)
             lls-1127  [005] d...   936.421574: sys_lgetxattr -> 0x4d564000
             lls-1127  [005] d...   936.421575: sys_capget(header: 4d70f000, dataptr: 1000)
             lls-1127  [005] d...   936.421580: sys_capget -> 0x0
             lls-1127  [005] d...   936.421580: sys_lgetxattr(pathname: 4d710000, name: 3000, value: 3, size: 812)
             lls-1127  [005] d...   936.421589: sys_lgetxattr -> 0x4d710000
             lls-1127  [005] d...   936.426130: sys_lgetxattr(pathname: 4d713000, name: 2abc, value: 3, size: 32)
             lls-1127  [005] d...   936.426141: sys_lgetxattr -> 0x4d713000
             lls-1127  [005] d...   936.426145: sys_newlstat(filename: 3, statbuf: f76ff3f0)
             lls-1127  [005] d...   936.426146: sys_newlstat -> 0x0
             lls-1127  [005] d...   936.431748: sys_lgetxattr(pathname: 0, name: 1000, value: 3, size: 22)
Obviously I'm not calling newfstat with a fd of 4d55b085. The calls are
obviously incorrect, and confusing.
Other efforts have been made to fix this:
https://lkml.org/lkml/2012/3/26/367
But the real solution is to rewrite the syscall internals and come up
with a fixed solution. One that doesn't require all the kluge that the
current solution has.
Thus for now, instead of outputting incorrect data, simply ignore compat syscalls.
With this patch the same test now gives:
 #> grep lls /debug/tracing/trace
 #>
Compat system calls simply are not traced. If users need compat
syscalls, then they should just use the raw syscall tracepoints.
For an architecture to have its compat syscalls ignored, it must
define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS (done in asm/ftrace.h) and also
define an arch_trace_is_compat_syscall() function that will return true
if the current task should ignore tracing the syscall.
I want to stress that this change does not affect actual syscalls in any
way, shape or form. It is only used within the tracing system and
doesn't interfere with the syscall logic at all. The changes are
consolidated nicely into trace_syscalls.c and asm/ftrace.h.
I had to make one small modification to asm/thread_info.h and that was
to remove the include of asm/ftrace.h. As asm/ftrace.h required the
current_thread_info() it was causing include hell. That include was
added back in 2008 when the function graph tracer was added:
 commit caf4b323 "tracing, x86: add low level support for ftrace return tracing"
It does not need to be included there.
Link: http://lkml.kernel.org/r/1360703939.21867.99.camel@gandalf.local.home
Acked-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
		
	
			
		
			
				
	
	
		
			735 lines
		
	
	
	
		
			18 KiB
			
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			735 lines
		
	
	
	
		
			18 KiB
			
		
	
	
	
		
			C
		
	
	
	
	
	
| #include <trace/syscall.h>
 | |
| #include <trace/events/syscalls.h>
 | |
| #include <linux/syscalls.h>
 | |
| #include <linux/slab.h>
 | |
| #include <linux/kernel.h>
 | |
| #include <linux/module.h>	/* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */
 | |
| #include <linux/ftrace.h>
 | |
| #include <linux/perf_event.h>
 | |
| #include <asm/syscall.h>
 | |
| 
 | |
| #include "trace_output.h"
 | |
| #include "trace.h"
 | |
| 
 | |
/* Held while the refcounts and enabled-syscall bitmaps in this file are updated. */
static DEFINE_MUTEX(syscall_trace_lock);

/* One bit per syscall number; set while that syscall's ftrace event is enabled. */
static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);

/* Number of enabled ftrace syscall enter/exit events. */
static int sys_refcount_enter;
static int sys_refcount_exit;

/* Defined at the bottom of the file; the event classes refer to them earlier. */
static int syscall_enter_register(struct ftrace_event_call *event,
				 enum trace_reg type, void *data);
static int syscall_exit_register(struct ftrace_event_call *event,
				 enum trace_reg type, void *data);
 | |
| 
 | |
| static struct list_head *
 | |
| syscall_get_enter_fields(struct ftrace_event_call *call)
 | |
| {
 | |
| 	struct syscall_metadata *entry = call->data;
 | |
| 
 | |
| 	return &entry->enter_fields;
 | |
| }
 | |
| 
 | |
/*
 * Bounds of the array of syscall metadata pointers; find_syscall_meta()
 * walks the half-open range [__start_syscalls_metadata, __stop_syscalls_metadata).
 */
extern struct syscall_metadata *__start_syscalls_metadata[];
extern struct syscall_metadata *__stop_syscalls_metadata[];

/*
 * Table indexed by syscall number, allocated and filled in by
 * init_ftrace_syscalls(). Entries without metadata stay NULL.
 */
static struct syscall_metadata **syscalls_metadata;
 | |
| 
 | |
#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME
/*
 * Default comparison of a kallsyms symbol against a syscall metadata name.
 *
 * The first three characters of both strings are skipped: architectures
 * using syscall wrappers may expose aliases prefixed with "SyS" rather
 * than "sys", and comparing the prefix would make those mismatch.
 *
 * Returns true when everything after the prefix is identical.
 */
static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
{
	const char *sym_tail = sym + 3;
	const char *name_tail = name + 3;

	return strcmp(sym_tail, name_tail) == 0;
}
#endif
 | |
| 
 | |
#ifdef ARCH_TRACE_IGNORE_COMPAT_SYSCALLS
/*
 * On some architectures (x86 being the notable one) a 32bit task running
 * on a 64bit kernel uses a different syscall numbering than a 64bit task.
 * Looking such a number up in the 64bit tables would report the wrong
 * syscall, so those calls are not reported at all.
 *
 * An architecture opts in by defining ARCH_TRACE_IGNORE_COMPAT_SYSCALLS
 * and by providing arch_trace_is_compat_syscall(), which tells the
 * tracing code that the current syscall must be ignored.
 *
 * Returns the syscall number, or -1 for a compat syscall.
 */
static int
trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs)
{
	return unlikely(arch_trace_is_compat_syscall(regs)) ?
		-1 : syscall_get_nr(task, regs);
}
#else
/* No compat filtering on this architecture: report the number as-is. */
static inline int
trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs)
{
	return syscall_get_nr(task, regs);
}
#endif /* ARCH_TRACE_IGNORE_COMPAT_SYSCALLS */
 | |
| 
 | |
| static __init struct syscall_metadata *
 | |
| find_syscall_meta(unsigned long syscall)
 | |
| {
 | |
| 	struct syscall_metadata **start;
 | |
| 	struct syscall_metadata **stop;
 | |
| 	char str[KSYM_SYMBOL_LEN];
 | |
| 
 | |
| 
 | |
| 	start = __start_syscalls_metadata;
 | |
| 	stop = __stop_syscalls_metadata;
 | |
| 	kallsyms_lookup(syscall, NULL, NULL, NULL, str);
 | |
| 
 | |
| 	if (arch_syscall_match_sym_name(str, "sys_ni_syscall"))
 | |
| 		return NULL;
 | |
| 
 | |
| 	for ( ; start < stop; start++) {
 | |
| 		if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name))
 | |
| 			return *start;
 | |
| 	}
 | |
| 	return NULL;
 | |
| }
 | |
| 
 | |
| static struct syscall_metadata *syscall_nr_to_meta(int nr)
 | |
| {
 | |
| 	if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
 | |
| 		return NULL;
 | |
| 
 | |
| 	return syscalls_metadata[nr];
 | |
| }
 | |
| 
 | |
| static enum print_line_t
 | |
| print_syscall_enter(struct trace_iterator *iter, int flags,
 | |
| 		    struct trace_event *event)
 | |
| {
 | |
| 	struct trace_seq *s = &iter->seq;
 | |
| 	struct trace_entry *ent = iter->ent;
 | |
| 	struct syscall_trace_enter *trace;
 | |
| 	struct syscall_metadata *entry;
 | |
| 	int i, ret, syscall;
 | |
| 
 | |
| 	trace = (typeof(trace))ent;
 | |
| 	syscall = trace->nr;
 | |
| 	entry = syscall_nr_to_meta(syscall);
 | |
| 
 | |
| 	if (!entry)
 | |
| 		goto end;
 | |
| 
 | |
| 	if (entry->enter_event->event.type != ent->type) {
 | |
| 		WARN_ON_ONCE(1);
 | |
| 		goto end;
 | |
| 	}
 | |
| 
 | |
| 	ret = trace_seq_printf(s, "%s(", entry->name);
 | |
| 	if (!ret)
 | |
| 		return TRACE_TYPE_PARTIAL_LINE;
 | |
| 
 | |
| 	for (i = 0; i < entry->nb_args; i++) {
 | |
| 		/* parameter types */
 | |
| 		if (trace_flags & TRACE_ITER_VERBOSE) {
 | |
| 			ret = trace_seq_printf(s, "%s ", entry->types[i]);
 | |
| 			if (!ret)
 | |
| 				return TRACE_TYPE_PARTIAL_LINE;
 | |
| 		}
 | |
| 		/* parameter values */
 | |
| 		ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i],
 | |
| 				       trace->args[i],
 | |
| 				       i == entry->nb_args - 1 ? "" : ", ");
 | |
| 		if (!ret)
 | |
| 			return TRACE_TYPE_PARTIAL_LINE;
 | |
| 	}
 | |
| 
 | |
| 	ret = trace_seq_putc(s, ')');
 | |
| 	if (!ret)
 | |
| 		return TRACE_TYPE_PARTIAL_LINE;
 | |
| 
 | |
| end:
 | |
| 	ret =  trace_seq_putc(s, '\n');
 | |
| 	if (!ret)
 | |
| 		return TRACE_TYPE_PARTIAL_LINE;
 | |
| 
 | |
| 	return TRACE_TYPE_HANDLED;
 | |
| }
 | |
| 
 | |
| static enum print_line_t
 | |
| print_syscall_exit(struct trace_iterator *iter, int flags,
 | |
| 		   struct trace_event *event)
 | |
| {
 | |
| 	struct trace_seq *s = &iter->seq;
 | |
| 	struct trace_entry *ent = iter->ent;
 | |
| 	struct syscall_trace_exit *trace;
 | |
| 	int syscall;
 | |
| 	struct syscall_metadata *entry;
 | |
| 	int ret;
 | |
| 
 | |
| 	trace = (typeof(trace))ent;
 | |
| 	syscall = trace->nr;
 | |
| 	entry = syscall_nr_to_meta(syscall);
 | |
| 
 | |
| 	if (!entry) {
 | |
| 		trace_seq_printf(s, "\n");
 | |
| 		return TRACE_TYPE_HANDLED;
 | |
| 	}
 | |
| 
 | |
| 	if (entry->exit_event->event.type != ent->type) {
 | |
| 		WARN_ON_ONCE(1);
 | |
| 		return TRACE_TYPE_UNHANDLED;
 | |
| 	}
 | |
| 
 | |
| 	ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
 | |
| 				trace->ret);
 | |
| 	if (!ret)
 | |
| 		return TRACE_TYPE_PARTIAL_LINE;
 | |
| 
 | |
| 	return TRACE_TYPE_HANDLED;
 | |
| }
 | |
| 
 | |
/*
 * Only referenced when a SYSCALL_FIELD() size check fails.
 * NOTE(review): presumably never defined, so that a mismatch surfaces at
 * link time - confirm.
 */
extern char *__bad_type_size(void);

/*
 * Expands to the type-string, name-string, offset, size and signedness
 * arguments of trace_define_field() for member @name of a local variable
 * that must be called "trace". If sizeof(@type) differs from the size of
 * the member, the type-string argument becomes a call to __bad_type_size().
 *
 * The expansion is a comma-separated argument list, so it must not be
 * wrapped in parentheses.
 */
#define SYSCALL_FIELD(type, name)					\
	sizeof(type) != sizeof(trace.name) ?				\
		__bad_type_size() :					\
		#type, #name, offsetof(typeof(trace), name),		\
		sizeof(trace.name), is_signed_type(type)
 | |
| 
 | |
| static
 | |
| int  __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
 | |
| {
 | |
| 	int i;
 | |
| 	int pos = 0;
 | |
| 
 | |
| 	/* When len=0, we just calculate the needed length */
 | |
| #define LEN_OR_ZERO (len ? len - pos : 0)
 | |
| 
 | |
| 	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
 | |
| 	for (i = 0; i < entry->nb_args; i++) {
 | |
| 		pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
 | |
| 				entry->args[i], sizeof(unsigned long),
 | |
| 				i == entry->nb_args - 1 ? "" : ", ");
 | |
| 	}
 | |
| 	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
 | |
| 
 | |
| 	for (i = 0; i < entry->nb_args; i++) {
 | |
| 		pos += snprintf(buf + pos, LEN_OR_ZERO,
 | |
| 				", ((unsigned long)(REC->%s))", entry->args[i]);
 | |
| 	}
 | |
| 
 | |
| #undef LEN_OR_ZERO
 | |
| 
 | |
| 	/* return the length of print_fmt */
 | |
| 	return pos;
 | |
| }
 | |
| 
 | |
| static int set_syscall_print_fmt(struct ftrace_event_call *call)
 | |
| {
 | |
| 	char *print_fmt;
 | |
| 	int len;
 | |
| 	struct syscall_metadata *entry = call->data;
 | |
| 
 | |
| 	if (entry->enter_event != call) {
 | |
| 		call->print_fmt = "\"0x%lx\", REC->ret";
 | |
| 		return 0;
 | |
| 	}
 | |
| 
 | |
| 	/* First: called with 0 length to calculate the needed length */
 | |
| 	len = __set_enter_print_fmt(entry, NULL, 0);
 | |
| 
 | |
| 	print_fmt = kmalloc(len + 1, GFP_KERNEL);
 | |
| 	if (!print_fmt)
 | |
| 		return -ENOMEM;
 | |
| 
 | |
| 	/* Second: actually write the @print_fmt */
 | |
| 	__set_enter_print_fmt(entry, print_fmt, len + 1);
 | |
| 	call->print_fmt = print_fmt;
 | |
| 
 | |
| 	return 0;
 | |
| }
 | |
| 
 | |
| static void free_syscall_print_fmt(struct ftrace_event_call *call)
 | |
| {
 | |
| 	struct syscall_metadata *entry = call->data;
 | |
| 
 | |
| 	if (entry->enter_event == call)
 | |
| 		kfree(call->print_fmt);
 | |
| }
 | |
| 
 | |
| static int syscall_enter_define_fields(struct ftrace_event_call *call)
 | |
| {
 | |
| 	struct syscall_trace_enter trace;
 | |
| 	struct syscall_metadata *meta = call->data;
 | |
| 	int ret;
 | |
| 	int i;
 | |
| 	int offset = offsetof(typeof(trace), args);
 | |
| 
 | |
| 	ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
 | |
| 	if (ret)
 | |
| 		return ret;
 | |
| 
 | |
| 	for (i = 0; i < meta->nb_args; i++) {
 | |
| 		ret = trace_define_field(call, meta->types[i],
 | |
| 					 meta->args[i], offset,
 | |
| 					 sizeof(unsigned long), 0,
 | |
| 					 FILTER_OTHER);
 | |
| 		offset += sizeof(unsigned long);
 | |
| 	}
 | |
| 
 | |
| 	return ret;
 | |
| }
 | |
| 
 | |
| static int syscall_exit_define_fields(struct ftrace_event_call *call)
 | |
| {
 | |
| 	struct syscall_trace_exit trace;
 | |
| 	int ret;
 | |
| 
 | |
| 	ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
 | |
| 	if (ret)
 | |
| 		return ret;
 | |
| 
 | |
| 	ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
 | |
| 				 FILTER_OTHER);
 | |
| 
 | |
| 	return ret;
 | |
| }
 | |
| 
 | |
| static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 | |
| {
 | |
| 	struct syscall_trace_enter *entry;
 | |
| 	struct syscall_metadata *sys_data;
 | |
| 	struct ring_buffer_event *event;
 | |
| 	struct ring_buffer *buffer;
 | |
| 	int syscall_nr;
 | |
| 	int size;
 | |
| 
 | |
| 	syscall_nr = trace_get_syscall_nr(current, regs);
 | |
| 	if (syscall_nr < 0)
 | |
| 		return;
 | |
| 	if (!test_bit(syscall_nr, enabled_enter_syscalls))
 | |
| 		return;
 | |
| 
 | |
| 	sys_data = syscall_nr_to_meta(syscall_nr);
 | |
| 	if (!sys_data)
 | |
| 		return;
 | |
| 
 | |
| 	size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
 | |
| 
 | |
| 	event = trace_current_buffer_lock_reserve(&buffer,
 | |
| 			sys_data->enter_event->event.type, size, 0, 0);
 | |
| 	if (!event)
 | |
| 		return;
 | |
| 
 | |
| 	entry = ring_buffer_event_data(event);
 | |
| 	entry->nr = syscall_nr;
 | |
| 	syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
 | |
| 
 | |
| 	if (!filter_current_check_discard(buffer, sys_data->enter_event,
 | |
| 					  entry, event))
 | |
| 		trace_current_buffer_unlock_commit(buffer, event, 0, 0);
 | |
| }
 | |
| 
 | |
| static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 | |
| {
 | |
| 	struct syscall_trace_exit *entry;
 | |
| 	struct syscall_metadata *sys_data;
 | |
| 	struct ring_buffer_event *event;
 | |
| 	struct ring_buffer *buffer;
 | |
| 	int syscall_nr;
 | |
| 
 | |
| 	syscall_nr = trace_get_syscall_nr(current, regs);
 | |
| 	if (syscall_nr < 0)
 | |
| 		return;
 | |
| 	if (!test_bit(syscall_nr, enabled_exit_syscalls))
 | |
| 		return;
 | |
| 
 | |
| 	sys_data = syscall_nr_to_meta(syscall_nr);
 | |
| 	if (!sys_data)
 | |
| 		return;
 | |
| 
 | |
| 	event = trace_current_buffer_lock_reserve(&buffer,
 | |
| 			sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
 | |
| 	if (!event)
 | |
| 		return;
 | |
| 
 | |
| 	entry = ring_buffer_event_data(event);
 | |
| 	entry->nr = syscall_nr;
 | |
| 	entry->ret = syscall_get_return_value(current, regs);
 | |
| 
 | |
| 	if (!filter_current_check_discard(buffer, sys_data->exit_event,
 | |
| 					  entry, event))
 | |
| 		trace_current_buffer_unlock_commit(buffer, event, 0, 0);
 | |
| }
 | |
| 
 | |
| static int reg_event_syscall_enter(struct ftrace_event_call *call)
 | |
| {
 | |
| 	int ret = 0;
 | |
| 	int num;
 | |
| 
 | |
| 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
 | |
| 	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
 | |
| 		return -ENOSYS;
 | |
| 	mutex_lock(&syscall_trace_lock);
 | |
| 	if (!sys_refcount_enter)
 | |
| 		ret = register_trace_sys_enter(ftrace_syscall_enter, NULL);
 | |
| 	if (!ret) {
 | |
| 		set_bit(num, enabled_enter_syscalls);
 | |
| 		sys_refcount_enter++;
 | |
| 	}
 | |
| 	mutex_unlock(&syscall_trace_lock);
 | |
| 	return ret;
 | |
| }
 | |
| 
 | |
| static void unreg_event_syscall_enter(struct ftrace_event_call *call)
 | |
| {
 | |
| 	int num;
 | |
| 
 | |
| 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
 | |
| 	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
 | |
| 		return;
 | |
| 	mutex_lock(&syscall_trace_lock);
 | |
| 	sys_refcount_enter--;
 | |
| 	clear_bit(num, enabled_enter_syscalls);
 | |
| 	if (!sys_refcount_enter)
 | |
| 		unregister_trace_sys_enter(ftrace_syscall_enter, NULL);
 | |
| 	mutex_unlock(&syscall_trace_lock);
 | |
| }
 | |
| 
 | |
| static int reg_event_syscall_exit(struct ftrace_event_call *call)
 | |
| {
 | |
| 	int ret = 0;
 | |
| 	int num;
 | |
| 
 | |
| 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
 | |
| 	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
 | |
| 		return -ENOSYS;
 | |
| 	mutex_lock(&syscall_trace_lock);
 | |
| 	if (!sys_refcount_exit)
 | |
| 		ret = register_trace_sys_exit(ftrace_syscall_exit, NULL);
 | |
| 	if (!ret) {
 | |
| 		set_bit(num, enabled_exit_syscalls);
 | |
| 		sys_refcount_exit++;
 | |
| 	}
 | |
| 	mutex_unlock(&syscall_trace_lock);
 | |
| 	return ret;
 | |
| }
 | |
| 
 | |
| static void unreg_event_syscall_exit(struct ftrace_event_call *call)
 | |
| {
 | |
| 	int num;
 | |
| 
 | |
| 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
 | |
| 	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
 | |
| 		return;
 | |
| 	mutex_lock(&syscall_trace_lock);
 | |
| 	sys_refcount_exit--;
 | |
| 	clear_bit(num, enabled_exit_syscalls);
 | |
| 	if (!sys_refcount_exit)
 | |
| 		unregister_trace_sys_exit(ftrace_syscall_exit, NULL);
 | |
| 	mutex_unlock(&syscall_trace_lock);
 | |
| }
 | |
| 
 | |
| static int init_syscall_trace(struct ftrace_event_call *call)
 | |
| {
 | |
| 	int id;
 | |
| 	int num;
 | |
| 
 | |
| 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
 | |
| 	if (num < 0 || num >= NR_syscalls) {
 | |
| 		pr_debug("syscall %s metadata not mapped, disabling ftrace event\n",
 | |
| 				((struct syscall_metadata *)call->data)->name);
 | |
| 		return -ENOSYS;
 | |
| 	}
 | |
| 
 | |
| 	if (set_syscall_print_fmt(call) < 0)
 | |
| 		return -ENOMEM;
 | |
| 
 | |
| 	id = trace_event_raw_init(call);
 | |
| 
 | |
| 	if (id < 0) {
 | |
| 		free_syscall_print_fmt(call);
 | |
| 		return id;
 | |
| 	}
 | |
| 
 | |
| 	return id;
 | |
| }
 | |
| 
 | |
| struct trace_event_functions enter_syscall_print_funcs = {
 | |
| 	.trace		= print_syscall_enter,
 | |
| };
 | |
| 
 | |
| struct trace_event_functions exit_syscall_print_funcs = {
 | |
| 	.trace		= print_syscall_exit,
 | |
| };
 | |
| 
 | |
| struct ftrace_event_class event_class_syscall_enter = {
 | |
| 	.system		= "syscalls",
 | |
| 	.reg		= syscall_enter_register,
 | |
| 	.define_fields	= syscall_enter_define_fields,
 | |
| 	.get_fields	= syscall_get_enter_fields,
 | |
| 	.raw_init	= init_syscall_trace,
 | |
| };
 | |
| 
 | |
| struct ftrace_event_class event_class_syscall_exit = {
 | |
| 	.system		= "syscalls",
 | |
| 	.reg		= syscall_exit_register,
 | |
| 	.define_fields	= syscall_exit_define_fields,
 | |
| 	.fields		= LIST_HEAD_INIT(event_class_syscall_exit.fields),
 | |
| 	.raw_init	= init_syscall_trace,
 | |
| };
 | |
| 
 | |
| unsigned long __init __weak arch_syscall_addr(int nr)
 | |
| {
 | |
| 	return (unsigned long)sys_call_table[nr];
 | |
| }
 | |
| 
 | |
| static int __init init_ftrace_syscalls(void)
 | |
| {
 | |
| 	struct syscall_metadata *meta;
 | |
| 	unsigned long addr;
 | |
| 	int i;
 | |
| 
 | |
| 	syscalls_metadata = kcalloc(NR_syscalls, sizeof(*syscalls_metadata),
 | |
| 				    GFP_KERNEL);
 | |
| 	if (!syscalls_metadata) {
 | |
| 		WARN_ON(1);
 | |
| 		return -ENOMEM;
 | |
| 	}
 | |
| 
 | |
| 	for (i = 0; i < NR_syscalls; i++) {
 | |
| 		addr = arch_syscall_addr(i);
 | |
| 		meta = find_syscall_meta(addr);
 | |
| 		if (!meta)
 | |
| 			continue;
 | |
| 
 | |
| 		meta->syscall_nr = i;
 | |
| 		syscalls_metadata[i] = meta;
 | |
| 	}
 | |
| 
 | |
| 	return 0;
 | |
| }
 | |
| early_initcall(init_ftrace_syscalls);
 | |
| 
 | |
| #ifdef CONFIG_PERF_EVENTS
 | |
| 
 | |
/* Number of enabled perf syscall enter/exit events. */
static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit;

/* One bit per syscall number; set while that syscall's perf event is enabled. */
static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
 | |
| 
 | |
| static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 | |
| {
 | |
| 	struct syscall_metadata *sys_data;
 | |
| 	struct syscall_trace_enter *rec;
 | |
| 	struct hlist_head *head;
 | |
| 	int syscall_nr;
 | |
| 	int rctx;
 | |
| 	int size;
 | |
| 
 | |
| 	syscall_nr = trace_get_syscall_nr(current, regs);
 | |
| 	if (syscall_nr < 0)
 | |
| 		return;
 | |
| 	if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
 | |
| 		return;
 | |
| 
 | |
| 	sys_data = syscall_nr_to_meta(syscall_nr);
 | |
| 	if (!sys_data)
 | |
| 		return;
 | |
| 
 | |
| 	/* get the size after alignment with the u32 buffer size field */
 | |
| 	size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
 | |
| 	size = ALIGN(size + sizeof(u32), sizeof(u64));
 | |
| 	size -= sizeof(u32);
 | |
| 
 | |
| 	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
 | |
| 		      "perf buffer not large enough"))
 | |
| 		return;
 | |
| 
 | |
| 	rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
 | |
| 				sys_data->enter_event->event.type, regs, &rctx);
 | |
| 	if (!rec)
 | |
| 		return;
 | |
| 
 | |
| 	rec->nr = syscall_nr;
 | |
| 	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
 | |
| 			       (unsigned long *)&rec->args);
 | |
| 
 | |
| 	head = this_cpu_ptr(sys_data->enter_event->perf_events);
 | |
| 	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
 | |
| }
 | |
| 
 | |
| static int perf_sysenter_enable(struct ftrace_event_call *call)
 | |
| {
 | |
| 	int ret = 0;
 | |
| 	int num;
 | |
| 
 | |
| 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
 | |
| 
 | |
| 	mutex_lock(&syscall_trace_lock);
 | |
| 	if (!sys_perf_refcount_enter)
 | |
| 		ret = register_trace_sys_enter(perf_syscall_enter, NULL);
 | |
| 	if (ret) {
 | |
| 		pr_info("event trace: Could not activate"
 | |
| 				"syscall entry trace point");
 | |
| 	} else {
 | |
| 		set_bit(num, enabled_perf_enter_syscalls);
 | |
| 		sys_perf_refcount_enter++;
 | |
| 	}
 | |
| 	mutex_unlock(&syscall_trace_lock);
 | |
| 	return ret;
 | |
| }
 | |
| 
 | |
| static void perf_sysenter_disable(struct ftrace_event_call *call)
 | |
| {
 | |
| 	int num;
 | |
| 
 | |
| 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
 | |
| 
 | |
| 	mutex_lock(&syscall_trace_lock);
 | |
| 	sys_perf_refcount_enter--;
 | |
| 	clear_bit(num, enabled_perf_enter_syscalls);
 | |
| 	if (!sys_perf_refcount_enter)
 | |
| 		unregister_trace_sys_enter(perf_syscall_enter, NULL);
 | |
| 	mutex_unlock(&syscall_trace_lock);
 | |
| }
 | |
| 
 | |
| static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 | |
| {
 | |
| 	struct syscall_metadata *sys_data;
 | |
| 	struct syscall_trace_exit *rec;
 | |
| 	struct hlist_head *head;
 | |
| 	int syscall_nr;
 | |
| 	int rctx;
 | |
| 	int size;
 | |
| 
 | |
| 	syscall_nr = trace_get_syscall_nr(current, regs);
 | |
| 	if (syscall_nr < 0)
 | |
| 		return;
 | |
| 	if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
 | |
| 		return;
 | |
| 
 | |
| 	sys_data = syscall_nr_to_meta(syscall_nr);
 | |
| 	if (!sys_data)
 | |
| 		return;
 | |
| 
 | |
| 	/* We can probably do that at build time */
 | |
| 	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
 | |
| 	size -= sizeof(u32);
 | |
| 
 | |
| 	/*
 | |
| 	 * Impossible, but be paranoid with the future
 | |
| 	 * How to put this check outside runtime?
 | |
| 	 */
 | |
| 	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
 | |
| 		"exit event has grown above perf buffer size"))
 | |
| 		return;
 | |
| 
 | |
| 	rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
 | |
| 				sys_data->exit_event->event.type, regs, &rctx);
 | |
| 	if (!rec)
 | |
| 		return;
 | |
| 
 | |
| 	rec->nr = syscall_nr;
 | |
| 	rec->ret = syscall_get_return_value(current, regs);
 | |
| 
 | |
| 	head = this_cpu_ptr(sys_data->exit_event->perf_events);
 | |
| 	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
 | |
| }
 | |
| 
 | |
| static int perf_sysexit_enable(struct ftrace_event_call *call)
 | |
| {
 | |
| 	int ret = 0;
 | |
| 	int num;
 | |
| 
 | |
| 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
 | |
| 
 | |
| 	mutex_lock(&syscall_trace_lock);
 | |
| 	if (!sys_perf_refcount_exit)
 | |
| 		ret = register_trace_sys_exit(perf_syscall_exit, NULL);
 | |
| 	if (ret) {
 | |
| 		pr_info("event trace: Could not activate"
 | |
| 				"syscall exit trace point");
 | |
| 	} else {
 | |
| 		set_bit(num, enabled_perf_exit_syscalls);
 | |
| 		sys_perf_refcount_exit++;
 | |
| 	}
 | |
| 	mutex_unlock(&syscall_trace_lock);
 | |
| 	return ret;
 | |
| }
 | |
| 
 | |
| static void perf_sysexit_disable(struct ftrace_event_call *call)
 | |
| {
 | |
| 	int num;
 | |
| 
 | |
| 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
 | |
| 
 | |
| 	mutex_lock(&syscall_trace_lock);
 | |
| 	sys_perf_refcount_exit--;
 | |
| 	clear_bit(num, enabled_perf_exit_syscalls);
 | |
| 	if (!sys_perf_refcount_exit)
 | |
| 		unregister_trace_sys_exit(perf_syscall_exit, NULL);
 | |
| 	mutex_unlock(&syscall_trace_lock);
 | |
| }
 | |
| 
 | |
| #endif /* CONFIG_PERF_EVENTS */
 | |
| 
 | |
| static int syscall_enter_register(struct ftrace_event_call *event,
 | |
| 				 enum trace_reg type, void *data)
 | |
| {
 | |
| 	switch (type) {
 | |
| 	case TRACE_REG_REGISTER:
 | |
| 		return reg_event_syscall_enter(event);
 | |
| 	case TRACE_REG_UNREGISTER:
 | |
| 		unreg_event_syscall_enter(event);
 | |
| 		return 0;
 | |
| 
 | |
| #ifdef CONFIG_PERF_EVENTS
 | |
| 	case TRACE_REG_PERF_REGISTER:
 | |
| 		return perf_sysenter_enable(event);
 | |
| 	case TRACE_REG_PERF_UNREGISTER:
 | |
| 		perf_sysenter_disable(event);
 | |
| 		return 0;
 | |
| 	case TRACE_REG_PERF_OPEN:
 | |
| 	case TRACE_REG_PERF_CLOSE:
 | |
| 	case TRACE_REG_PERF_ADD:
 | |
| 	case TRACE_REG_PERF_DEL:
 | |
| 		return 0;
 | |
| #endif
 | |
| 	}
 | |
| 	return 0;
 | |
| }
 | |
| 
 | |
| static int syscall_exit_register(struct ftrace_event_call *event,
 | |
| 				 enum trace_reg type, void *data)
 | |
| {
 | |
| 	switch (type) {
 | |
| 	case TRACE_REG_REGISTER:
 | |
| 		return reg_event_syscall_exit(event);
 | |
| 	case TRACE_REG_UNREGISTER:
 | |
| 		unreg_event_syscall_exit(event);
 | |
| 		return 0;
 | |
| 
 | |
| #ifdef CONFIG_PERF_EVENTS
 | |
| 	case TRACE_REG_PERF_REGISTER:
 | |
| 		return perf_sysexit_enable(event);
 | |
| 	case TRACE_REG_PERF_UNREGISTER:
 | |
| 		perf_sysexit_disable(event);
 | |
| 		return 0;
 | |
| 	case TRACE_REG_PERF_OPEN:
 | |
| 	case TRACE_REG_PERF_CLOSE:
 | |
| 	case TRACE_REG_PERF_ADD:
 | |
| 	case TRACE_REG_PERF_DEL:
 | |
| 		return 0;
 | |
| #endif
 | |
| 	}
 | |
| 	return 0;
 | |
| }
 |