 cede88418b
			
		
	
	
	cede88418b
	
	
	
		
			
			The rtmutex code is the only user of __HAVE_ARCH_CMPXCHG, and we have a few other users of cmpxchg() which do not care about __HAVE_ARCH_CMPXCHG. This define was first introduced in 23f78d4a0 ("[PATCH] pi-futex: rt mutex core"), which is v2.6.18. The generic cmpxchg was introduced later in 068fbad288 ("Add cmpxchg_local to asm-generic for per cpu atomic operations"), which is v2.6.25. Back then something was required to get rtmutex working with the fast path on architectures without cmpxchg, and this seems to be the result.

			It popped up recently on rt-users because ARM (v6+) does not define __HAVE_ARCH_CMPXCHG (even though it implements it), which results in slower locking performance in the fast path. To put some numbers on it: preempt -RT, am335x, 10 loops of 100000 invocations of rt_spin_lock() + rt_spin_unlock() (time "total" is the average of the 10 loops for the 100000 invocations, "loop" is "total / 100000 * 1000"):

			cmpxchg |    slowpath used  ||    cmpxchg used
			        |   total   | loop  ||   total    | loop
			--------|-----------|-------||------------|-------
			ARMv6   | 9129.4 us | 91 ns ||  3311.9 us |  33 ns
			generic | 9360.2 us | 94 ns || 10834.6 us | 108 ns
			----------------------------||--------------------

			Forcing it to generic cmpxchg() made things worse for the slowpath and even worse in the cmpxchg() path. It boils down to 14ns more per lock+unlock in a cache hot loop, so it might not be that much in the real world. The last test was a substitute for a pre-ARMv6 machine, but then I was able to perform the comparison on imx28, which is ARMv5 and therefore is always using the generic cmpxchg implementation. And the numbers:

			         |   total     | loop
			-------- |-----------  |--------
			slowpath | 263937.2 us | 2639 ns
			cmpxchg  |  16934.2 us |  169 ns
			--------------------------------

			The numbers are larger since the machine is slower in general. However, letting rtmutex use cmpxchg() instead of the slowpath seems to improve things.
Since, from the ARM (tested on am335x + imx28) point of view, always using cmpxchg() in rt_mutex_lock() + rt_mutex_unlock() makes sense, I would drop the define.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: will.deacon@arm.com
Cc: linux-arm-kernel@lists.infradead.org
Link: http://lkml.kernel.org/r/20150225175613.GE6823@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
		
			
				
	
	
		
			105 lines
		
	
	
	
		
			2.2 KiB
			
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			105 lines
		
	
	
	
		
			2.2 KiB
			
		
	
	
	
		
			C
		
	
	
	
	
	
| /*
 | |
|  * Generic UP xchg and cmpxchg using interrupt disablement.  Does not
 | |
|  * support SMP.
 | |
|  */
 | |
| 
 | |
| #ifndef __ASM_GENERIC_CMPXCHG_H
 | |
| #define __ASM_GENERIC_CMPXCHG_H
 | |
| 
 | |
| #ifdef CONFIG_SMP
 | |
| #error "Cannot use generic cmpxchg on SMP"
 | |
| #endif
 | |
| 
 | |
| #include <linux/types.h>
 | |
| #include <linux/irqflags.h>
 | |
| 
 | |
| #ifndef xchg
 | |
| 
 | |
| /*
 | |
|  * This function doesn't exist, so you'll get a linker error if
 | |
|  * something tries to do an invalidly-sized xchg().
 | |
|  */
 | |
| extern void __xchg_called_with_bad_pointer(void);
 | |
| 
 | |
| static inline
 | |
| unsigned long __xchg(unsigned long x, volatile void *ptr, int size)
 | |
| {
 | |
| 	unsigned long ret, flags;
 | |
| 
 | |
| 	switch (size) {
 | |
| 	case 1:
 | |
| #ifdef __xchg_u8
 | |
| 		return __xchg_u8(x, ptr);
 | |
| #else
 | |
| 		local_irq_save(flags);
 | |
| 		ret = *(volatile u8 *)ptr;
 | |
| 		*(volatile u8 *)ptr = x;
 | |
| 		local_irq_restore(flags);
 | |
| 		return ret;
 | |
| #endif /* __xchg_u8 */
 | |
| 
 | |
| 	case 2:
 | |
| #ifdef __xchg_u16
 | |
| 		return __xchg_u16(x, ptr);
 | |
| #else
 | |
| 		local_irq_save(flags);
 | |
| 		ret = *(volatile u16 *)ptr;
 | |
| 		*(volatile u16 *)ptr = x;
 | |
| 		local_irq_restore(flags);
 | |
| 		return ret;
 | |
| #endif /* __xchg_u16 */
 | |
| 
 | |
| 	case 4:
 | |
| #ifdef __xchg_u32
 | |
| 		return __xchg_u32(x, ptr);
 | |
| #else
 | |
| 		local_irq_save(flags);
 | |
| 		ret = *(volatile u32 *)ptr;
 | |
| 		*(volatile u32 *)ptr = x;
 | |
| 		local_irq_restore(flags);
 | |
| 		return ret;
 | |
| #endif /* __xchg_u32 */
 | |
| 
 | |
| #ifdef CONFIG_64BIT
 | |
| 	case 8:
 | |
| #ifdef __xchg_u64
 | |
| 		return __xchg_u64(x, ptr);
 | |
| #else
 | |
| 		local_irq_save(flags);
 | |
| 		ret = *(volatile u64 *)ptr;
 | |
| 		*(volatile u64 *)ptr = x;
 | |
| 		local_irq_restore(flags);
 | |
| 		return ret;
 | |
| #endif /* __xchg_u64 */
 | |
| #endif /* CONFIG_64BIT */
 | |
| 
 | |
| 	default:
 | |
| 		__xchg_called_with_bad_pointer();
 | |
| 		return x;
 | |
| 	}
 | |
| }
 | |
| 
 | |
/*
 * xchg - exchange the object at @ptr with @x and yield the previous value
 *
 * The width is taken from sizeof(*(ptr)), @x is passed down as an
 * unsigned long, and the result is cast back to the pointed-to type.
 */
#define xchg(ptr, x)							\
	((__typeof__(*(ptr)))						\
		__xchg((unsigned long)(x), (ptr), sizeof(*(ptr))))
 | |
| 
 | |
| #endif /* xchg */
 | |
| 
 | |
| /*
 | |
|  * Atomic compare and exchange.
 | |
|  */
 | |
| #include <asm-generic/cmpxchg-local.h>
 | |
| 
 | |
/*
 * cmpxchg_local - fallback definition, used only when nothing has defined
 * cmpxchg_local before this point.
 *
 * Forwards to __cmpxchg_local_generic() with @o and @n cast to
 * unsigned long and the width taken from sizeof(*(ptr)); the result is
 * cast back to the pointed-to type.
 */
#ifndef cmpxchg_local
#define cmpxchg_local(ptr, o, n)					\
	((__typeof__(*(ptr)))						\
		__cmpxchg_local_generic((ptr),				\
					(unsigned long)(o),		\
					(unsigned long)(n),		\
					sizeof(*(ptr))))
#endif /* cmpxchg_local */
 | |
| 
 | |
/*
 * cmpxchg64_local - fallback definition, used only when nothing has
 * defined cmpxchg64_local before this point.  Arguments are handed to
 * __cmpxchg64_local_generic() without any casts.
 */
#ifndef cmpxchg64_local
#define cmpxchg64_local(ptr, o, n)					\
	__cmpxchg64_local_generic((ptr), (o), (n))
#endif /* cmpxchg64_local */
 | |
| 
 | |
/*
 * cmpxchg()/cmpxchg64() are plain aliases for the _local variants.  This
 * header rejects CONFIG_SMP builds with an #error, so no separate SMP
 * form is provided here.
 */
#define cmpxchg(ptr, o, n)						\
	cmpxchg_local((ptr), (o), (n))
#define cmpxchg64(ptr, o, n)						\
	cmpxchg64_local((ptr), (o), (n))
 | |
| 
 | |
| #endif /* __ASM_GENERIC_CMPXCHG_H */
 |