/*
 * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
 * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
 * processors. CPUs supporting Intel(R) AVX extensions will get an additional
 * boost.
 *
 * This work was inspired by the vectorized implementation of Dean Gaudet.
 * Additional information on it can be found at:
 *    http://www.arctic.org/~dean/crypto/sha1.html
 *
 * It was improved upon with more efficient vectorization of the message
 * scheduling. This implementation has also been optimized for all current and
 * several future generations of Intel CPUs.
 *
 * See this article for more information about the implementation details:
 *   http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Copyright (C) 2010, Intel Corp.
 *   Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 *            Ronen Zohar <ronen.zohar@intel.com>
 *
 * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
 *   Author: Mathias Krause <minipli@googlemail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>

#define CTX	%rdi	// arg1
#define BUF	%rsi	// arg2
#define CNT	%rdx	// arg3

#define REG_A	%ecx
#define REG_B	%esi
#define REG_C	%edi
#define REG_D	%ebp
#define REG_E	%edx

#define REG_T1	%eax
#define REG_T2	%ebx

#define K_BASE		%r8
#define HASH_PTR	%r9
#define BUFFER_PTR	%r10
#define BUFFER_END	%r11

#define W_TMP1	%xmm0
#define W_TMP2	%xmm9

#define W0	%xmm1
#define W4	%xmm2
#define W8	%xmm3
#define W12	%xmm4
#define W16	%xmm5
#define W20	%xmm6
#define W24	%xmm7
#define W28	%xmm8

#define XMM_SHUFB_BSWAP	%xmm10

/* we keep a window of 16 pre-calculated w[i]+K values (64 bytes) in a
 * circular buffer on the stack */
#define WK(t)	(((t) & 15) * 4)(%rsp)
#define W_PRECALC_AHEAD	16

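/*
 * Indexing sketch (an illustrative note, not from the original authors):
 * WK(t) addresses stack slot (t mod 16), so WK(t) and WK(t+16) resolve to
 * the same address, e.g. WK(0) == WK(16) == 0(%rsp). The pre-calculation
 * runs W_PRECALC_AHEAD rounds ahead of the scalar rounds and refills each
 * slot with w[t+16]+K only after round t has consumed w[t]+K, keeping the
 * working set at 64 bytes for all 80 rounds.
 */
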
/*
 * This macro implements the SHA-1 function's body for the given number
 * of 64-byte input blocks
 * param: function's name
 */
.macro SHA1_VECTOR_ASM  name
	ENTRY(\name)

	push	%rbx
	push	%rbp
	push	%r12

	mov	%rsp, %r12
	sub	$64, %rsp		# allocate workspace
	and	$~15, %rsp		# align stack

	mov	CTX, HASH_PTR
	mov	BUF, BUFFER_PTR

	shl	$6, CNT			# multiply by 64
	add	BUF, CNT
	mov	CNT, BUFFER_END

	lea	K_XMM_AR(%rip), K_BASE
	xmm_mov	BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP

	SHA1_PIPELINED_MAIN_BODY

	# cleanup workspace
	mov	$8, %ecx		# 8 quadwords = 64 bytes
	mov	%rsp, %rdi
	xor	%rax, %rax
	rep stosq			# zero the w[i]+K scratch area

	mov	%r12, %rsp		# deallocate workspace

	pop	%r12
	pop	%rbp
	pop	%rbx
	ret

	ENDPROC(\name)
.endm

/*
 * This macro implements 80 rounds of SHA-1 for each 64-byte block of
 * the input
 */
.macro SHA1_PIPELINED_MAIN_BODY
	INIT_REGALLOC

	mov	  (HASH_PTR), A
	mov	 4(HASH_PTR), B
	mov	 8(HASH_PTR), C
	mov	12(HASH_PTR), D
	mov	16(HASH_PTR), E

  .set i, 0
  .rept W_PRECALC_AHEAD		# prime the first 16 w[i]+K values
	W_PRECALC i
    .set i, (i+1)
  .endr

.align 4
1:
	RR F1,A,B,C,D,E,0
	RR F1,D,E,A,B,C,2
	RR F1,B,C,D,E,A,4
	RR F1,E,A,B,C,D,6
	RR F1,C,D,E,A,B,8

	RR F1,A,B,C,D,E,10
	RR F1,D,E,A,B,C,12
	RR F1,B,C,D,E,A,14
	RR F1,E,A,B,C,D,16
	RR F1,C,D,E,A,B,18

	RR F2,A,B,C,D,E,20
	RR F2,D,E,A,B,C,22
	RR F2,B,C,D,E,A,24
	RR F2,E,A,B,C,D,26
	RR F2,C,D,E,A,B,28

	RR F2,A,B,C,D,E,30
	RR F2,D,E,A,B,C,32
	RR F2,B,C,D,E,A,34
	RR F2,E,A,B,C,D,36
	RR F2,C,D,E,A,B,38

	RR F3,A,B,C,D,E,40
	RR F3,D,E,A,B,C,42
	RR F3,B,C,D,E,A,44
	RR F3,E,A,B,C,D,46
	RR F3,C,D,E,A,B,48

	RR F3,A,B,C,D,E,50
	RR F3,D,E,A,B,C,52
	RR F3,B,C,D,E,A,54
	RR F3,E,A,B,C,D,56
	RR F3,C,D,E,A,B,58

	add	$64, BUFFER_PTR		# move to the next 64-byte block
	cmp	BUFFER_END, BUFFER_PTR	# if this was the last block, use a
	cmovae	K_BASE, BUFFER_PTR	# dummy source to avoid buffer overrun

	RR F4,A,B,C,D,E,60
	RR F4,D,E,A,B,C,62
	RR F4,B,C,D,E,A,64
	RR F4,E,A,B,C,D,66
	RR F4,C,D,E,A,B,68

	RR F4,A,B,C,D,E,70
	RR F4,D,E,A,B,C,72
	RR F4,B,C,D,E,A,74
	RR F4,E,A,B,C,D,76
	RR F4,C,D,E,A,B,78

	UPDATE_HASH   (HASH_PTR), A
	UPDATE_HASH  4(HASH_PTR), B
	UPDATE_HASH  8(HASH_PTR), C
	UPDATE_HASH 12(HASH_PTR), D
	UPDATE_HASH 16(HASH_PTR), E

	RESTORE_RENAMED_REGS
	cmp	K_BASE, BUFFER_PTR	# BUFFER_PTR == K_BASE means we reached the end
	jne	1b
.endm

.macro INIT_REGALLOC
  .set A, REG_A
  .set B, REG_B
  .set C, REG_C
  .set D, REG_D
  .set E, REG_E
  .set T1, REG_T1
  .set T2, REG_T2
.endm

.macro RESTORE_RENAMED_REGS
	# order is important (REG_C is where it should be)
	mov	B, REG_B
	mov	D, REG_D
	mov	A, REG_A
	mov	E, REG_E
.endm

.macro SWAP_REG_NAMES  a, b
  .set _T, \a
  .set \a, \b
  .set \b, _T
.endm

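/*
 * Register-renaming sketch (an illustrative note, not from the original
 * authors): the F macros below pair "mov \c, T1" with
 * "SWAP_REG_NAMES \c, T1". After the swap, the symbol c names the register
 * holding the fresh copy, while T1 names the register still holding the
 * original value, so the Boolean function may clobber T1 freely while c
 * stays live, saving one move per round.
 */
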
/* F1 is the Ch function: F1(b,c,d) = d ^ (b & (c ^ d)) */
.macro F1  b, c, d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	xor	\d, T1
	and	\b, T1
	xor	\d, T1
.endm

/* F2 is the Parity function: F2(b,c,d) = b ^ c ^ d */
.macro F2  b, c, d
	mov	\d, T1
	SWAP_REG_NAMES \d, T1
	xor	\c, T1
	xor	\b, T1
.endm

/* F3 is the Maj function, computed as ((b | c) & d) | (b & c) */
.macro F3  b, c, d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	mov	\b, T2
	or	\b, T1
	and	\c, T2
	and	\d, T1
	or	T2, T1
.endm

/* rounds 60-79 reuse the Parity function */
.macro F4  b, c, d
	F2 \b, \c, \d
.endm

.macro UPDATE_HASH  hash, val
	add	\hash, \val
	mov	\val, \hash
.endm

/*
 * RR does two rounds of SHA-1 back to back with W[] pre-calc
 *   t1 = F(b, c, d);   e += w(i)
 *   e += t1;           b <<= 30;   d  += w(i+1);
 *   t1 = F(a, b, c);
 *   d += t1;           a <<= 5;
 *   e += a;
 *   t1 = e;            a >>= 7;
 *   t1 <<= 5;
 *   d += t1;
 */
.macro RR  F, a, b, c, d, e, round
	add	WK(\round), \e
	\F   \b, \c, \d		# t1 = F(b, c, d);
	W_PRECALC (\round + W_PRECALC_AHEAD)
	rol	$30, \b
	add	T1, \e
	add	WK(\round + 1), \d

	\F   \a, \b, \c
	W_PRECALC (\round + W_PRECALC_AHEAD + 1)
	rol	$5, \a
	add	\a, \e
	add	T1, \d
	ror	$7, \a		# (a <<r 5) >>r 7 => a <<r 30

	mov	\e, T1
	SWAP_REG_NAMES \e, T1

	rol	$5, T1
	add	T1, \d

	# write:  \a, \b
	# rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
.endm

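/*
 * Scalar reference for comparison (an illustrative sketch, not part of
 * the build): one textbook SHA-1 round is
 *
 *	e += rol32(a, 5) + F(b, c, d) + w[i] + K;
 *	b  = rol32(b, 30);
 *
 * RR fuses two such rounds and interleaves the W pre-calc. Note the
 * rol $5 / ror $7 pair above: the rol realizes round one's
 * "e += rol32(a, 5)", and the following ror leaves a rotated left by 30
 * in total, which is exactly the rol32(., 30) that a needs for its role
 * as "b" in the second round.
 */
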
.macro W_PRECALC  r
  .set i, \r

  .if (i < 20)
    .set K_XMM, 0
  .elseif (i < 40)
    .set K_XMM, 16
  .elseif (i < 60)
    .set K_XMM, 32
  .elseif (i < 80)
    .set K_XMM, 48
  .endif

  .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
    .set i, ((\r) % 80)	    # pre-compute for the next iteration
    .if (i == 0)
	W_PRECALC_RESET
    .endif
	W_PRECALC_00_15
  .elseif (i < 32)
	W_PRECALC_16_31
  .elseif (i < 80)	// rounds 32-79
	W_PRECALC_32_79
  .endif
.endm

.macro W_PRECALC_RESET
  .set W,          W0
  .set W_minus_04, W4
  .set W_minus_08, W8
  .set W_minus_12, W12
  .set W_minus_16, W16
  .set W_minus_20, W20
  .set W_minus_24, W24
  .set W_minus_28, W28
  .set W_minus_32, W
.endm

.macro W_PRECALC_ROTATE
  .set W_minus_32, W_minus_28
  .set W_minus_28, W_minus_24
  .set W_minus_24, W_minus_20
  .set W_minus_20, W_minus_16
  .set W_minus_16, W_minus_12
  .set W_minus_12, W_minus_08
  .set W_minus_08, W_minus_04
  .set W_minus_04, W
  .set W,          W_minus_32
.endm

.macro W_PRECALC_SSSE3

.macro W_PRECALC_00_15
	W_PRECALC_00_15_SSSE3
.endm
.macro W_PRECALC_16_31
	W_PRECALC_16_31_SSSE3
.endm
.macro W_PRECALC_32_79
	W_PRECALC_32_79_SSSE3
.endm

/* message scheduling pre-compute for rounds 0-15 */
.macro W_PRECALC_00_15_SSSE3
  .if ((i & 3) == 0)
	movdqu	(i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
	pshufb	XMM_SHUFB_BSWAP, W_TMP1
	movdqa	W_TMP1, W
  .elseif ((i & 3) == 2)
	paddd	(K_BASE), W_TMP1
  .elseif ((i & 3) == 3)
	movdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

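/*
 * Interleaving note (illustrative, not from the original authors): each
 * instantiation performs one quarter of the work on a 16-byte chunk,
 * selected by (i & 3): load, byte-swap, add the round constant, store to
 * the WK slot. Spread over four scalar rounds, the vector work hides
 * behind the ALU rounds instead of stalling them.
 */
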
/* message scheduling pre-compute for rounds 16-31
 *
 * - the last 32 w[i] values are kept in 8 XMM registers
 * - pre-calculate the w[i]+K values and store them to memory, for a later
 *   load by the scalar add instruction
 *
 * the w[i]->w[i-3] dependency makes these rounds the "heavy lifting" part
 * of the vectorization; the scheduling improves for rounds 32-79
 */
.macro W_PRECALC_16_31_SSSE3
  # blended scheduling of vector and scalar instruction streams, one 4-wide
  # vector iteration / 4 scalar rounds
  .if ((i & 3) == 0)
	movdqa	W_minus_12, W
	palignr	$8, W_minus_16, W	# w[i-14]
	movdqa	W_minus_04, W_TMP1
	psrldq	$4, W_TMP1		# w[i-3]
	pxor	W_minus_08, W
  .elseif ((i & 3) == 1)
	pxor	W_minus_16, W_TMP1
	pxor	W_TMP1, W
	movdqa	W, W_TMP2
	movdqa	W, W_TMP1
	pslldq	$12, W_TMP2
  .elseif ((i & 3) == 2)
	psrld	$31, W
	pslld	$1, W_TMP1
	por	W, W_TMP1
	movdqa	W_TMP2, W
	psrld	$30, W_TMP2
	pslld	$2, W
  .elseif ((i & 3) == 3)
	pxor	W, W_TMP1
	pxor	W_TMP2, W_TMP1
	movdqa	W_TMP1, W
	paddd	K_XMM(K_BASE), W_TMP1
	movdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

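/*
 * Scalar reference (an illustrative sketch, not part of the build):
 *
 *	for (i = 16; i < 32; i++)
 *		w[i] = rol32(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1);
 *
 * Computing four w[i] per vector iteration means the w[i-3] terms of the
 * upper lanes are not ready yet; the psrldq above supplies the three
 * available values plus a zero lane, and the pslldq $12 / rotate-by-2
 * steps patch the missing contribution into the last lane.
 */
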
/* message scheduling pre-compute for rounds 32-79
 *
 * in the SHA-1 specification: w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
 * we instead use the equivalent: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * which allows more efficient vectorization, since the w[i]=>w[i-3]
 * dependency is broken
 */
.macro W_PRECALC_32_79_SSSE3
  .if ((i & 3) == 0)
	movdqa	W_minus_04, W_TMP1
	pxor	W_minus_28, W		# W is W_minus_32 before xor
	palignr	$8, W_minus_08, W_TMP1	# w[i-6]
  .elseif ((i & 3) == 1)
	pxor	W_minus_16, W
	pxor	W_TMP1, W
	movdqa	W, W_TMP1
  .elseif ((i & 3) == 2)
	psrld	$30, W
	pslld	$2, W_TMP1
	por	W, W_TMP1
  .elseif ((i & 3) == 3)
	movdqa	W_TMP1, W
	paddd	K_XMM(K_BASE), W_TMP1
	movdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

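/*
 * Equivalence sketch (illustrative, not part of the build): applying the
 * standard recurrence to itself and using the fact that rotation
 * distributes over XOR gives, for i >= 32,
 *
 *	w[i] = rol32(w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32], 2);
 *
 * The nearest dependency is now w[i-6], so four consecutive w[i] fit in
 * one XMM register and can be produced by a single vector iteration.
 */
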
.endm		// W_PRECALC_SSSE3


#define K1	0x5a827999
#define K2	0x6ed9eba1
#define K3	0x8f1bbcdc
#define K4	0xca62c1d6

.section .rodata
.align 16

K_XMM_AR:
	.long K1, K1, K1, K1
	.long K2, K2, K2, K2
	.long K3, K3, K3, K3
	.long K4, K4, K4, K4

BSWAP_SHUFB_CTL:
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f

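/*
 * Note on the shuffle control (illustrative, not from the original
 * authors): each .long lists pshufb byte indices; stored little-endian,
 * 0x00010203 becomes the byte sequence 03 02 01 00, i.e. every dword of
 * the message is byte-swapped from big-endian to host order.
 */
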
.section .text

W_PRECALC_SSSE3
.macro xmm_mov a, b
	movdqu	\a,\b
.endm

/* SSSE3 optimized implementation:
 *  extern "C" void sha1_transform_ssse3(u32 *digest, const char *data,
 *                                       unsigned int rounds);
 * where 'rounds' is the number of 64-byte input blocks to process
 */
SHA1_VECTOR_ASM     sha1_transform_ssse3

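/*
 * Usage sketch (hypothetical, for illustration only; the real caller
 * lives in the C glue code, e.g. sha1_ssse3_glue.c):
 *
 *	u32 digest[5] = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 };
 *	kernel_fpu_begin();
 *	sha1_transform_ssse3(digest, data, nblocks);
 *	kernel_fpu_end();
 *
 * 'data' must provide nblocks * 64 bytes; message padding and length
 * encoding are the caller's responsibility, as with any raw SHA-1 block
 * function.
 */
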
#ifdef CONFIG_AS_AVX

.macro W_PRECALC_AVX

.purgem W_PRECALC_00_15
.macro  W_PRECALC_00_15
    W_PRECALC_00_15_AVX
.endm
.purgem W_PRECALC_16_31
.macro  W_PRECALC_16_31
    W_PRECALC_16_31_AVX
.endm
.purgem W_PRECALC_32_79
.macro  W_PRECALC_32_79
    W_PRECALC_32_79_AVX
.endm

.macro W_PRECALC_00_15_AVX
  .if ((i & 3) == 0)
	vmovdqu	(i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
	vpshufb	XMM_SHUFB_BSWAP, W_TMP1, W
  .elseif ((i & 3) == 2)
	vpaddd	(K_BASE), W, W_TMP1
  .elseif ((i & 3) == 3)
	vmovdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

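/*
 * Note (illustrative, not from the original authors): the AVX variants
 * mirror the SSSE3 ones, but the non-destructive three-operand forms
 * (e.g. vpshufb src2, src1, dst) remove most of the movdqa register
 * copies the two-operand SSSE3 code needs.
 */
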
.macro W_PRECALC_16_31_AVX
  .if ((i & 3) == 0)
	vpalignr $8, W_minus_16, W_minus_12, W	# w[i-14]
	vpsrldq	$4, W_minus_04, W_TMP1		# w[i-3]
	vpxor	W_minus_08, W, W
	vpxor	W_minus_16, W_TMP1, W_TMP1
  .elseif ((i & 3) == 1)
	vpxor	W_TMP1, W, W
	vpslldq	$12, W, W_TMP2
	vpslld	$1, W, W_TMP1
  .elseif ((i & 3) == 2)
	vpsrld	$31, W, W
	vpor	W, W_TMP1, W_TMP1
	vpslld	$2, W_TMP2, W
	vpsrld	$30, W_TMP2, W_TMP2
  .elseif ((i & 3) == 3)
	vpxor	W, W_TMP1, W_TMP1
	vpxor	W_TMP2, W_TMP1, W
	vpaddd	K_XMM(K_BASE), W, W_TMP1
	vmovdqu	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.macro W_PRECALC_32_79_AVX
  .if ((i & 3) == 0)
	vpalignr $8, W_minus_08, W_minus_04, W_TMP1	# w[i-6]
	vpxor	W_minus_28, W, W		# W is W_minus_32 before xor
  .elseif ((i & 3) == 1)
	vpxor	W_minus_16, W_TMP1, W_TMP1
	vpxor	W_TMP1, W, W
  .elseif ((i & 3) == 2)
	vpslld	$2, W, W_TMP1
	vpsrld	$30, W, W
	vpor	W, W_TMP1, W
  .elseif ((i & 3) == 3)
	vpaddd	K_XMM(K_BASE), W, W_TMP1
	vmovdqu	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.endm    // W_PRECALC_AVX

W_PRECALC_AVX
.purgem xmm_mov
.macro xmm_mov a, b
	vmovdqu	\a,\b
.endm


/* AVX optimized implementation:
 *  extern "C" void sha1_transform_avx(u32 *digest, const char *data,
 *                                     unsigned int rounds);
 * where 'rounds' is the number of 64-byte input blocks to process
 */
SHA1_VECTOR_ASM     sha1_transform_avx

#endif