 500c2e1fdb
			
		
	
	
	500c2e1fdb
	
	
	
		
			
			The current locking mechanism uses a ll/sc sequence to release a spinlock. This is slower than a wmb() followed by a store to unlock. The branching forward to .subsection 2 on sc failure slows down the contended case. So we get rid of that part too. Since we are now working on naturally aligned u16 values, we can get rid of a masking operation as the LHU already does the right thing. The ANDI are reversed for better scheduling on multi-issue CPUs. On a 12 CPU 750MHz Octeon cn5750 this patch improves ipv4 UDP packet forwarding rates from 3.58*10^6 PPS to 3.99*10^6 PPS, or about 11%. Signed-off-by: David Daney <ddaney@caviumnetworks.com> To: linux-mips@linux-mips.org Patchwork: http://patchwork.linux-mips.org/patch/937/ Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
		
			
				
	
	
		
			181 lines
		
	
	
	
		
			4.9 KiB
			
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			181 lines
		
	
	
	
		
			4.9 KiB
			
		
	
	
	
		
			C
		
	
	
	
	
	
| /*
 | |
|  * This file is subject to the terms and conditions of the GNU General Public
 | |
|  * License.  See the file "COPYING" in the main directory of this archive
 | |
|  * for more details.
 | |
|  *
 | |
|  * Copyright (C) 2006 by Ralf Baechle (ralf@linux-mips.org)
 | |
|  */
 | |
| #ifndef __ASM_BARRIER_H
 | |
| #define __ASM_BARRIER_H
 | |
| 
 | |
| /*
 | |
|  * read_barrier_depends - Flush all pending reads that subsequent reads
 | |
|  * depend on.
 | |
|  *
 | |
|  * No data-dependent reads from memory-like regions are ever reordered
 | |
|  * over this barrier.  All reads preceding this primitive are guaranteed
 | |
|  * to access memory (but not necessarily other CPUs' caches) before any
 | |
|  * reads following this primitive that depend on the data returned by
 | |
|  * any of the preceding reads.  This primitive is much lighter weight than
 | |
|  * rmb() on most CPUs, and is never heavier weight than
 | |
|  * rmb().
 | |
|  *
 | |
|  * These ordering constraints are respected by both the local CPU
 | |
|  * and the compiler.
 | |
|  *
 | |
|  * Ordering is not guaranteed by anything other than these primitives,
 | |
|  * not even by data dependencies.  See the documentation for
 | |
|  * memory_barrier() for examples and URLs to more information.
 | |
|  *
 | |
|  * For example, the following code would force ordering (the initial
 | |
|  * value of "a" is zero, "b" is one, and "p" is "&a"):
 | |
|  *
 | |
|  * <programlisting>
 | |
|  *	CPU 0				CPU 1
 | |
|  *
 | |
|  *	b = 2;
 | |
|  *	memory_barrier();
 | |
|  *	p = &b;				q = p;
 | |
|  *					read_barrier_depends();
 | |
|  *					d = *q;
 | |
|  * </programlisting>
 | |
|  *
 | |
|  * because the read of "*q" depends on the read of "p" and these
 | |
|  * two reads are separated by a read_barrier_depends().  However,
 | |
|  * the following code, with the same initial values for "a" and "b":
 | |
|  *
 | |
|  * <programlisting>
 | |
|  *	CPU 0				CPU 1
 | |
|  *
 | |
|  *	a = 2;
 | |
|  *	memory_barrier();
 | |
|  *	b = 3;				y = b;
 | |
|  *					read_barrier_depends();
 | |
|  *					x = a;
 | |
|  * </programlisting>
 | |
|  *
 | |
|  * does not enforce ordering, since there is no data dependency between
 | |
|  * the read of "a" and the read of "b".  Therefore, on some CPUs, such
 | |
|  * as Alpha, "y" could be set to 3 and "x" to 0.  Use rmb()
 | |
|  * in cases like this where there are no data dependencies.
 | |
|  */
 | |
| /* In this header both variants expand to an empty statement. */
 | |
| #define read_barrier_depends()		do { } while(0)
 | |
| #define smp_read_barrier_depends()	do { } while(0)
 | |
| /*
 | |
|  * __sync() - issue a "sync" instruction if CONFIG_CPU_HAS_SYNC is set.
 | |
|  *
 | |
|  * The asm switches the assembler to ".set mips2" for the instruction and
 | |
|  * restores the previous settings with ".set pop".  The "memory" clobber
 | |
|  * also keeps the compiler from moving memory accesses across the statement.
 | |
|  *
 | |
|  * NOTE(review): the fallback below is an empty statement with no "memory"
 | |
|  * clobber, so in that configuration it does not constrain the compiler
 | |
|  * either -- confirm that this is intended.
 | |
|  */
 | |
| #ifdef CONFIG_CPU_HAS_SYNC
 | |
| #define __sync()				\
 | |
| 	__asm__ __volatile__(			\
 | |
| 		".set	push\n\t"		\
 | |
| 		".set	noreorder\n\t"		\
 | |
| 		".set	mips2\n\t"		\
 | |
| 		"sync\n\t"			\
 | |
| 		".set	pop"			\
 | |
| 		: /* no output */		\
 | |
| 		: /* no input */		\
 | |
| 		: "memory")
 | |
| #else /* !CONFIG_CPU_HAS_SYNC */
 | |
| #define __sync()	do { } while(0)
 | |
| #endif /* CONFIG_CPU_HAS_SYNC */
 | |
| /*
 | |
|  * __fast_iob() - perform one word load from CKSEG1 with register $0 as the
 | |
|  * destination, followed by a nop, with assembler reordering disabled.
 | |
|  *
 | |
|  * NOTE(review): CKSEG1 is defined elsewhere; presumably this is an uncached
 | |
|  * read whose value is thrown away -- confirm.
 | |
|  */
 | |
| #define __fast_iob()				\
 | |
| 	__asm__ __volatile__(			\
 | |
| 		".set	push\n\t"		\
 | |
| 		".set	noreorder\n\t"		\
 | |
| 		"lw	$0,%0\n\t"		\
 | |
| 		"nop\n\t"			\
 | |
| 		".set	pop"			\
 | |
| 		: /* no output */		\
 | |
| 		: "m" (*(int *)CKSEG1)		\
 | |
| 		: "memory")
 | |
| #ifdef CONFIG_CPU_CAVIUM_OCTEON
 | |
| # define OCTEON_SYNCW_STR	".set push\n.set arch=octeon\nsyncw\nsyncw\n.set pop\n"
 | |
| # define __syncw() 	__asm__ __volatile__(OCTEON_SYNCW_STR : : : "memory")
 | |
| /*
 | |
|  * Octeon mapping: fast_wmb() is __syncw() (two syncw instructions, see
 | |
|  * OCTEON_SYNCW_STR), fast_rmb() is only barrier(), fast_mb() is __sync()
 | |
|  * and fast_iob() expands to an empty statement.
 | |
|  *
 | |
|  * NOTE(review): why syncw is issued twice is not stated here; presumably a
 | |
|  * hardware workaround -- confirm against the Octeon documentation.
 | |
|  */
 | |
| # define fast_wmb()	__syncw()
 | |
| # define fast_rmb()	barrier()
 | |
| # define fast_mb()	__sync()
 | |
| # define fast_iob()	do { } while (0)
 | |
| #else /* ! CONFIG_CPU_CAVIUM_OCTEON: wmb, rmb and mb all use __sync() */
 | |
| # define fast_wmb()	__sync()
 | |
| # define fast_rmb()	__sync()
 | |
| # define fast_mb()	__sync()
 | |
| # ifdef CONFIG_SGI_IP28 /* load, sync, load from one fixed CKSEG1ADDR() address */
 | |
| #  define fast_iob()				\
 | |
| 	__asm__ __volatile__(			\
 | |
| 		".set	push\n\t"		\
 | |
| 		".set	noreorder\n\t"		\
 | |
| 		"lw	$0,%0\n\t"		\
 | |
| 		"sync\n\t"			\
 | |
| 		"lw	$0,%0\n\t"		\
 | |
| 		".set	pop"			\
 | |
| 		: /* no output */		\
 | |
| 		: "m" (*(int *)CKSEG1ADDR(0x1fa00004)) \
 | |
| 		: "memory")
 | |
| # else /* default: __sync() followed by the __fast_iob() load */
 | |
| #  define fast_iob()				\
 | |
| 	do {					\
 | |
| 		__sync();			\
 | |
| 		__fast_iob();			\
 | |
| 	} while (0)
 | |
| # endif /* CONFIG_SGI_IP28 */
 | |
| #endif /* CONFIG_CPU_CAVIUM_OCTEON */
 | |
| /*
 | |
|  * wmb(), rmb(), mb() and iob() as seen by the rest of the kernel.
 | |
|  *
 | |
|  * wmb() and rmb() are fast_wmb() and fast_rmb() in both branches.  With
 | |
|  * CONFIG_CPU_HAS_WB, mb() and iob() are routed through wbflush() from
 | |
|  * <asm/wbflush.h>; otherwise they are fast_mb() and fast_iob().
 | |
|  */
 | |
| #ifdef CONFIG_CPU_HAS_WB
 | |
| 
 | |
| #include <asm/wbflush.h>
 | |
| 
 | |
| #define wmb()		fast_wmb()
 | |
| #define rmb()		fast_rmb()
 | |
| #define mb()		wbflush()
 | |
| #define iob()		wbflush()
 | |
| 
 | |
| #else /* !CONFIG_CPU_HAS_WB */
 | |
| 
 | |
| #define wmb()		fast_wmb()
 | |
| #define rmb()		fast_rmb()
 | |
| #define mb()		fast_mb()
 | |
| #define iob()		fast_iob()
 | |
| 
 | |
| #endif /* !CONFIG_CPU_HAS_WB */
 | |
| /*
 | |
|  * SMP barriers.  When CONFIG_WEAK_ORDERING and CONFIG_SMP are both set,
 | |
|  * Octeon maps smp_mb() to __sync(), smp_rmb() to barrier() and smp_wmb()
 | |
|  * to __syncw(); every other CPU uses a bare "sync" with a "memory" clobber
 | |
|  * for all three.  In any other configuration all three are barrier().
 | |
|  */
 | |
| #if defined(CONFIG_WEAK_ORDERING) && defined(CONFIG_SMP)
 | |
| # ifdef CONFIG_CPU_CAVIUM_OCTEON
 | |
| #  define smp_mb()	__sync()
 | |
| #  define smp_rmb()	barrier()
 | |
| #  define smp_wmb()	__syncw()
 | |
| # else /* ! CONFIG_CPU_CAVIUM_OCTEON */
 | |
| #  define smp_mb()	__asm__ __volatile__("sync" : : :"memory")
 | |
| #  define smp_rmb()	__asm__ __volatile__("sync" : : :"memory")
 | |
| #  define smp_wmb()	__asm__ __volatile__("sync" : : :"memory")
 | |
| # endif /* CONFIG_CPU_CAVIUM_OCTEON */
 | |
| #else /* !(CONFIG_WEAK_ORDERING && CONFIG_SMP) */
 | |
| #define smp_mb()	barrier()
 | |
| #define smp_rmb()	barrier()
 | |
| #define smp_wmb()	barrier()
 | |
| #endif /* CONFIG_WEAK_ORDERING && CONFIG_SMP */
 | |
| /*
 | |
|  * __WEAK_LLSC_MB - string fragment for use inside inline asm: a "sync"
 | |
|  * instruction when CONFIG_WEAK_REORDERING_BEYOND_LLSC and CONFIG_SMP are
 | |
|  * both set, otherwise only whitespace and a newline.
 | |
|  */
 | |
| #if defined(CONFIG_WEAK_REORDERING_BEYOND_LLSC) && defined(CONFIG_SMP)
 | |
| #define __WEAK_LLSC_MB		"       sync	\n"
 | |
| #else
 | |
| #define __WEAK_LLSC_MB		"		\n"
 | |
| #endif
 | |
| /*
 | |
|  * set_mb(var, value) - assign value to var, then execute smp_mb().
 | |
|  * Both arguments are substituted textually, without added parentheses.
 | |
|  */
 | |
| #define set_mb(var, value) \
 | |
| 	do { var = value; smp_mb(); } while (0)
 | |
| /* Emit __WEAK_LLSC_MB as an inline-asm statement with a "memory" clobber. */
 | |
| #define smp_llsc_mb()	__asm__ __volatile__(__WEAK_LLSC_MB : : :"memory")
 | |
| /*
 | |
|  * smp_mb__before_llsc() is smp_wmb() on Octeon and smp_llsc_mb() elsewhere.
 | |
|  * nudge_writes() is a single syncw on Octeon and a full mb() elsewhere.
 | |
|  */
 | |
| #ifdef CONFIG_CPU_CAVIUM_OCTEON
 | |
| #define smp_mb__before_llsc() smp_wmb()
 | |
| /* Cause previous writes to become visible on all CPUs as soon as possible */
 | |
| #define nudge_writes() __asm__ __volatile__(".set push\n\t"		\
 | |
| 					    ".set arch=octeon\n\t"	\
 | |
| 					    "syncw\n\t"			\
 | |
| 					    ".set pop" : : : "memory")
 | |
| #else /* ! CONFIG_CPU_CAVIUM_OCTEON */
 | |
| #define smp_mb__before_llsc() smp_llsc_mb()
 | |
| #define nudge_writes() mb()
 | |
| #endif /* CONFIG_CPU_CAVIUM_OCTEON */
 | |
| 
 | |
| #endif /* __ASM_BARRIER_H */
 |