percpu: add optimized generic percpu accessors

It is an optimization and a cleanup, and adds the following new
generic percpu methods:

  percpu_read()
  percpu_write()
  percpu_add()
  percpu_sub()
  percpu_and()
  percpu_or()
  percpu_xor()

and implements support for them on x86. (other architectures will fall
back to a default implementation)

The advantage is that for example to read a local percpu variable,
instead of this sequence:

 return __get_cpu_var(var);

 ffffffff8102ca2b:	48 8b 14 fd 80 09 74 	mov    -0x7e8bf680(,%rdi,8),%rdx
 ffffffff8102ca32:	81
 ffffffff8102ca33:	48 c7 c0 d8 59 00 00 	mov    $0x59d8,%rax
 ffffffff8102ca3a:	48 8b 04 10          	mov    (%rax,%rdx,1),%rax

We can get a single instruction by using the optimized variants:

 return percpu_read(var);

 ffffffff8102ca3f:	65 48 8b 05 91 8f fd 	mov    %gs:0x7efd8f91(%rip),%rax

I also cleaned up the x86-specific APIs and made the x86 code use
these new generic percpu primitives.

tj: * fixed generic percpu_sub() definition as Roel Kluin pointed out
    * added percpu_and() for completeness's sake
    * made generic percpu ops atomic against preemption

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Tejun Heo <tj@kernel.org>
This commit is contained in:
Ingo Molnar 2009-01-15 22:15:53 +09:00
parent 004aa322f8
commit 6dbde35308
15 changed files with 102 additions and 48 deletions

View file

@ -40,16 +40,11 @@
#ifdef CONFIG_SMP
#define __percpu_seg_str "%%"__stringify(__percpu_seg)":"
#define __my_cpu_offset x86_read_percpu(this_cpu_off)
#define __my_cpu_offset percpu_read(this_cpu_off)
#else
#define __percpu_seg_str
#endif
#include <asm-generic/percpu.h>
/* We can use this directly for local CPU (faster). */
DECLARE_PER_CPU(unsigned long, this_cpu_off);
/* For arch-specific code, we can use direct single-insn ops (they
* don't give an lvalue though). */
extern void __bad_percpu_size(void);
@ -115,11 +110,13 @@ do { \
ret__; \
})
#define x86_read_percpu(var) percpu_from_op("mov", per_cpu__##var)
#define x86_write_percpu(var, val) percpu_to_op("mov", per_cpu__##var, val)
#define x86_add_percpu(var, val) percpu_to_op("add", per_cpu__##var, val)
#define x86_sub_percpu(var, val) percpu_to_op("sub", per_cpu__##var, val)
#define x86_or_percpu(var, val) percpu_to_op("or", per_cpu__##var, val)
#define percpu_read(var) percpu_from_op("mov", per_cpu__##var)
#define percpu_write(var, val) percpu_to_op("mov", per_cpu__##var, val)
#define percpu_add(var, val) percpu_to_op("add", per_cpu__##var, val)
#define percpu_sub(var, val) percpu_to_op("sub", per_cpu__##var, val)
#define percpu_and(var, val) percpu_to_op("and", per_cpu__##var, val)
#define percpu_or(var, val) percpu_to_op("or", per_cpu__##var, val)
#define percpu_xor(var, val) percpu_to_op("xor", per_cpu__##var, val)
/* This is not atomic against other CPUs -- CPU preemption needs to be off */
#define x86_test_and_clear_bit_percpu(bit, var) \
@ -131,6 +128,11 @@ do { \
old__; \
})
#include <asm-generic/percpu.h>
/* We can use this directly for local CPU (faster). */
DECLARE_PER_CPU(unsigned long, this_cpu_off);
#ifdef CONFIG_X86_64
extern void load_pda_offset(int cpu);
#else