 f317820cb6
			
		
	
	
	f317820cb6
	
	
	
		
			
			On CPUs with 64-byte last level cache lines, this yields roughly 10% better performance, independent of CPU vendor or specific model (as far as I was able to test). Signed-off-by: Jan Beulich <jbeulich@suse.com> Acked-by: H. Peter Anvin <hpa@zytor.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Link: http://lkml.kernel.org/r/5093E4B802000078000A615E@nat28.tlf.novell.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
		
			
				
	
	
		
			27 lines
		
	
	
	
		
			677 B
			
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			27 lines
		
	
	
	
		
			677 B
			
		
	
	
	
		
			C
		
	
	
	
	
	
| #ifndef _ASM_X86_XOR_64_H
 | |
| #define _ASM_X86_XOR_64_H
 | |
| 
 | |
| static struct xor_block_template xor_block_sse = {	/* template wiring up the xor_sse_* routines, which are defined outside this header */
 | |
| 	.name = "generic_sse",	/* name string carried by this template */
 | |
| 	.do_2 = xor_sse_2,	/* do_2..do_5 slots; presumably the 2- to 5-buffer XOR variants -- confirm against the xor_sse_* definitions */
 | |
| 	.do_3 = xor_sse_3,
 | |
| 	.do_4 = xor_sse_4,
 | |
| 	.do_5 = xor_sse_5,
 | |
| };
 | |
| 
 | |
| 
 | |
| /* Also try the AVX routines */
 | |
| #include <asm/xor_avx.h>
 | |
| 
 | |
| /* We force the use of the SSE xor block because it can write around L2.
 | |
|    We may also be able to load into the L1 only depending on how the CPU
 | |
|    deals with a load to a line that is being prefetched.
 | |
| 
 | |
|    XOR_TRY_TEMPLATES discards any earlier definition (see the #undef) and
 | |
|    expands to one do { ... } while (0) block that runs, in this order:
 | |
|    AVX_XOR_SPEED, xor_speed() on xor_block_sse_pf64, then xor_speed() on
 | |
|    xor_block_sse.  AVX_XOR_SPEED, xor_speed() and xor_block_sse_pf64 are
 | |
|    not defined in this header; presumably xor_speed() benchmarks the
 | |
|    template it is given -- confirm against its definition.  */
 | |
| #undef XOR_TRY_TEMPLATES
 | |
| #define XOR_TRY_TEMPLATES			\
 | |
| do {						\
 | |
| 	AVX_XOR_SPEED;				\
 | |
| 	xor_speed(&xor_block_sse_pf64);		\
 | |
| 	xor_speed(&xor_block_sse);		\
 | |
| } while (0)
 | |
| 
 | |
| #endif /* _ASM_X86_XOR_64_H */
 |