On CPUs with 64-byte last level cache lines, this yields roughly 10% better
performance, independent of CPU vendor or specific model (as far as I was
able to test).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/5093E4B802000078000A615E@nat28.tlf.novell.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
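For context, the template name referenced below (xor_block_sse_pf64) points to a variant that prefetches each 64-byte source cache line once per pass, instead of issuing redundant prefetches for data the hardware is already fetching. The following is only a minimal plain-C sketch of that prefetch pattern, assuming a 64-byte line size and using __builtin_prefetch; the function name and the plain-C form are illustrative assumptions, and the kernel's actual routines are hand-written SSE assembly that is not part of this hunk.

#include <stddef.h>

#define CACHE_LINE_BYTES 64	/* assumed last-level cache line size */

/*
 * Illustrative sketch only: XOR 'bytes' of src into dst, issuing a
 * single prefetch per 64-byte line of the source rather than one per
 * word.  The in-kernel code does the equivalent with SSE registers
 * and inline assembly.
 */
static void xor_prefetch_per_line(size_t bytes, unsigned long *dst,
				  const unsigned long *src)
{
	size_t words = bytes / sizeof(unsigned long);
	size_t words_per_line = CACHE_LINE_BYTES / sizeof(unsigned long);
	size_t i;

	for (i = 0; i < words; i++) {
		if ((i % words_per_line) == 0)
			/*
			 * One prefetch per upcoming cache line of src.
			 * Prefetching past the end of the buffer is a
			 * harmless hint; it cannot fault.
			 */
			__builtin_prefetch(&src[i + words_per_line]);
		dst[i] ^= src[i];
	}
}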
#ifndef _ASM_X86_XOR_64_H
#define _ASM_X86_XOR_64_H

static struct xor_block_template xor_block_sse = {
	.name = "generic_sse",
	.do_2 = xor_sse_2,
	.do_3 = xor_sse_3,
	.do_4 = xor_sse_4,
	.do_5 = xor_sse_5,
};


/* Also try the AVX routines */
#include <asm/xor_avx.h>

/* We force the use of the SSE xor block because it can write around L2.
   We may also be able to load into the L1 only depending on how the cpu
   deals with a load to a line that is being prefetched.  */
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES			\
do {						\
	AVX_XOR_SPEED;				\
	xor_speed(&xor_block_sse_pf64);		\
	xor_speed(&xor_block_sse);		\
} while (0)

#endif /* _ASM_X86_XOR_64_H */
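XOR_TRY_TEMPLATES is expanded by the XOR calibration code, which benchmarks each registered template via xor_speed() and keeps the fastest one for computing parity. The sketch below mimics that "try several implementations, keep the fastest" idea as a self-contained user-space program; the candidate functions, struct name, and timing method are made-up assumptions for illustration, not the kernel's xor_speed() implementation.

#include <stddef.h>
#include <stdio.h>
#include <time.h>

#define BUF_BYTES 4096
#define WORDS (BUF_BYTES / sizeof(unsigned long))

/* Two hypothetical stand-in implementations to choose between. */
static void xor_basic(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	for (unsigned long i = 0; i < bytes / sizeof(unsigned long); i++)
		p1[i] ^= p2[i];
}

static void xor_unrolled(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	for (unsigned long i = 0; i < bytes / sizeof(unsigned long); i += 4) {
		p1[i + 0] ^= p2[i + 0];
		p1[i + 1] ^= p2[i + 1];
		p1[i + 2] ^= p2[i + 2];
		p1[i + 3] ^= p2[i + 3];
	}
}

/* Simplified analogue of a template entry: a name plus a 2-buffer routine. */
struct xor_candidate {
	const char *name;
	void (*do_2)(unsigned long, unsigned long *, unsigned long *);
};

int main(void)
{
	static unsigned long a[WORDS], b[WORDS];
	struct xor_candidate candidates[] = {
		{ "basic",    xor_basic },
		{ "unrolled", xor_unrolled },
	};
	struct xor_candidate *best = NULL;
	double best_time = 0.0;

	for (size_t c = 0; c < sizeof(candidates) / sizeof(candidates[0]); c++) {
		/* Time each candidate over the same workload. */
		clock_t start = clock();
		for (int rep = 0; rep < 100000; rep++)
			candidates[c].do_2(BUF_BYTES, a, b);
		double elapsed = (double)(clock() - start) / CLOCKS_PER_SEC;

		printf("%-8s: %.3f s\n", candidates[c].name, elapsed);
		if (!best || elapsed < best_time) {
			best = &candidates[c];
			best_time = elapsed;
		}
	}
	printf("selected: %s\n", best->name);
	return 0;
}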