From 3c77c1eb2b01f6306cf9f7bb9e74522f91ed6d58 Mon Sep 17 00:00:00 2001
From: Geoffrey McRae
Date: Fri, 18 May 2018 18:50:07 +1000
Subject: [PATCH] NASM version of a SSE2 memcpy

---
 common/memcpySSE.asm | 312 +++++++++++++++++++++++++++++++++++++++++++
 common/memcpySSE2.h  |  40 +++---
 2 files changed, 334 insertions(+), 18 deletions(-)
 create mode 100644 common/memcpySSE.asm

diff --git a/common/memcpySSE.asm b/common/memcpySSE.asm
new file mode 100644
index 00000000..498ca483
--- /dev/null
+++ b/common/memcpySSE.asm
@@ -0,0 +1,312 @@
+.code
+
+memcpySSE proc ; copies r8 bytes from [rdx] to [rcx]; declared in memcpySSE2.h as: void __fastcall memcpySSE(void *dst, const void *src, size_t length)
+  ; dst = rcx (stored to with vmovntdq, so it must be 16-byte aligned)
+  ; src = rdx (loaded from with vmovaps, so it must be 16-byte aligned)
+  ; len = r8  (any byte count; handled as 256-byte chunks, then 16-byte blocks, then a 0-15 byte tail)
+
+  test r8 , r8 ; zero length: return before touching the stack
+  jne  OK
+  ret
+
+  OK:
+  ; void * end = dst + (length & ~0xFF);  i.e. dst plus the number of bytes covered by whole 256-byte chunks
+  ; end = r10
+  mov r9 , r8
+  and r9 , -0100h ; -0100h == ~0FFh, clears the low 8 bits of the length
+  mov r10, rcx
+  add r10, r9
+
+  ; size_t rem = (length & 0xFF) >> 4;  i.e. number of whole 16-byte blocks left after the 256-byte chunks (0-15)
+  ; rem = r11
+  mov r11, r8
+  and r11, 0FFh
+  shr r11, 4
+
+  sub rsp, 8 + 10*16 + 4*8 ; 10*16 holds xmm6-xmm15 (callee-saved in the Win64 ABI); saves start at rsp + 4*8; the extra 8 presumably re-aligns rsp to 16 for movdqa - NOTE(review): no unwind directives are emitted for this adjustment
+  movdqa oword ptr [rsp + 4*8 + 00 ], xmm6
+  movdqa oword ptr [rsp + 4*8 + 16 ], xmm7
+  movdqa oword ptr [rsp + 4*8 + 32 ], xmm8
+  movdqa oword ptr [rsp + 4*8 + 48 ], xmm9
+  movdqa oword ptr [rsp + 4*8 + 64 ], xmm10
+  movdqa oword ptr [rsp + 4*8 + 80 ], xmm11
+  movdqa oword ptr [rsp + 4*8 + 96 ], xmm12
+  movdqa oword ptr [rsp + 4*8 + 112], xmm13
+  movdqa oword ptr [rsp + 4*8 + 128], xmm14
+  movdqa oword ptr [rsp + 4*8 + 144], xmm15
+
+  cmp rcx, r10 ; dst == end means length < 256: skip the chunk loop
+  je  RemainingBlocks
+
+  FullLoop: ; one iteration = 256 bytes: 16 aligned loads followed by 16 non-temporal stores
+  vmovaps xmm0 , xmmword ptr [rdx + 000h]
+  vmovaps xmm1 , xmmword ptr [rdx + 010h]
+  vmovaps xmm2 , xmmword ptr [rdx + 020h]
+  vmovaps xmm3 , xmmword ptr [rdx + 030h]
+  vmovaps xmm4 , xmmword ptr [rdx + 040h]
+  vmovaps xmm5 , xmmword ptr [rdx + 050h]
+  vmovaps xmm6 , xmmword ptr [rdx + 060h]
+  vmovaps xmm7 , xmmword ptr [rdx + 070h]
+  vmovaps xmm8 , xmmword ptr [rdx + 080h]
+  vmovaps xmm9 , xmmword ptr [rdx + 090h]
+  vmovaps xmm10, xmmword ptr [rdx + 0A0h]
+  vmovaps xmm11, xmmword ptr [rdx + 0B0h]
+  vmovaps xmm12, xmmword ptr [rdx + 0C0h]
+  vmovaps xmm13, xmmword ptr [rdx + 0D0h]
+  vmovaps xmm14, xmmword ptr [rdx + 0E0h]
+  vmovaps xmm15, xmmword ptr [rdx + 0F0h]
+  vmovntdq xmmword ptr [rcx + 000h], xmm0
+  vmovntdq xmmword ptr [rcx + 010h], xmm1
+  vmovntdq xmmword ptr [rcx + 020h], xmm2
+  vmovntdq xmmword ptr [rcx + 030h], xmm3
+  vmovntdq xmmword ptr [rcx + 040h], xmm4
+  vmovntdq xmmword ptr [rcx + 050h], xmm5
+  vmovntdq xmmword ptr [rcx + 060h], xmm6
+  vmovntdq xmmword ptr [rcx + 070h], xmm7
+  vmovntdq xmmword ptr [rcx + 080h], xmm8
+  vmovntdq xmmword ptr [rcx + 090h], xmm9
+  vmovntdq xmmword ptr [rcx + 0A0h], xmm10
+  vmovntdq xmmword ptr [rcx + 0B0h], xmm11
+  vmovntdq xmmword ptr [rcx + 0C0h], xmm12
+  vmovntdq xmmword ptr [rcx + 0D0h], xmm13
+  vmovntdq xmmword ptr [rcx + 0E0h], xmm14
+  vmovntdq xmmword ptr [rcx + 0F0h], xmm15
+  add rdx, 0100h
+  add rcx, 0100h
+  cmp rcx, r10
+  jne FullLoop
+
+  RemainingBlocks: ; computed jump: target = JumpTable + (15 - rem) * 5
+  lea  r9 , JumpTable
+  mov  r10, 15
+  sub  r10, r11
+  imul r10, 5 ; 5 = byte size of one near `jmp rel32` table entry
+  add  r9 , r10
+  jmp  r9
+
+  JumpTable: ; entry i jumps to Block(15 - i); every entry must assemble to exactly 5 bytes
+  jmp Block15
+  jmp Block14
+  jmp Block13
+  jmp Block12
+  jmp Block11
+  jmp Block10
+  jmp Block9
+  jmp Block8
+  jmp Block7
+  jmp Block6
+  jmp Block5
+  jmp Block4
+  jmp Block3
+  jmp Block2
+  jmp Block1
+  jmp Block0
+
+  ; ensure we generate near jumps (the padding puts every target beyond the 127-byte reach of a 2-byte short jmp)
+  padding1 db 127 dup(090h)
+
+  Block15: ; BlockN copies 16-byte block N-1 and falls through to Block(N-1), so entering at BlockN copies N blocks
+  vmovaps xmm14, xmmword ptr [rdx + 0E0h]
+  vmovntdq xmmword ptr [rcx + 0E0h], xmm14
+  Block14:
+  vmovaps xmm13, xmmword ptr [rdx + 0D0h]
+  vmovntdq xmmword ptr [rcx + 0D0h], xmm13
+  Block13:
+  vmovaps xmm12, xmmword ptr [rdx + 0C0h]
+  vmovntdq xmmword ptr [rcx + 0C0h], xmm12
+  Block12:
+  vmovaps xmm11, xmmword ptr [rdx + 0B0h]
+  vmovntdq xmmword ptr [rcx + 0B0h], xmm11
+  Block11:
+  vmovaps xmm10, xmmword ptr [rdx + 0A0h]
+  vmovntdq xmmword ptr [rcx + 0A0h], xmm10
+  Block10:
+  vmovaps xmm9 , xmmword ptr [rdx + 090h]
+  vmovntdq xmmword ptr [rcx + 090h], xmm9
+  Block9:
+  vmovaps xmm8 , xmmword ptr [rdx + 080h]
+  vmovntdq xmmword ptr [rcx + 080h], xmm8
+  Block8:
+  vmovaps xmm7 , xmmword ptr [rdx + 070h]
+  vmovntdq xmmword ptr [rcx + 070h], xmm7
+  Block7:
+  vmovaps xmm6 , xmmword ptr [rdx + 060h]
+  vmovntdq xmmword ptr [rcx + 060h], xmm6
+  Block6:
+  vmovaps xmm5 , xmmword ptr [rdx + 050h]
+  vmovntdq xmmword ptr [rcx + 050h], xmm5
+  Block5:
+  vmovaps xmm4 , xmmword ptr [rdx + 040h]
+  vmovntdq xmmword ptr [rcx + 040h], xmm4
+  Block4:
+  vmovaps xmm3 , xmmword ptr [rdx + 030h]
+  vmovntdq xmmword ptr [rcx + 030h], xmm3
+  Block3:
+  vmovaps xmm2 , xmmword ptr [rdx + 020h]
+  vmovntdq xmmword ptr [rcx + 020h], xmm2
+  Block2:
+  vmovaps xmm1 , xmmword ptr [rdx + 010h]
+  vmovntdq xmmword ptr [rcx + 010h], xmm1
+  Block1:
+  vmovaps xmm0 , xmmword ptr [rdx + 000h]
+  vmovntdq xmmword ptr [rcx + 000h], xmm0
+
+  imul r11, 16 ; advance src/dst past the rem blocks just copied (skipped when rem == 0, which enters at Block0)
+  add  rdx, r11
+  add  rcx, r11
+
+  Block0: ; SIMD copying is finished: restore the callee-saved xmm registers and release the stack area
+  movdqa xmm6 , oword ptr [rsp + 4*8 + 00 ]
+  movdqa xmm7 , oword ptr [rsp + 4*8 + 16 ]
+  movdqa xmm8 , oword ptr [rsp + 4*8 + 32 ]
+  movdqa xmm9 , oword ptr [rsp + 4*8 + 48 ]
+  movdqa xmm10, oword ptr [rsp + 4*8 + 64 ]
+  movdqa xmm11, oword ptr [rsp + 4*8 + 80 ]
+  movdqa xmm12, oword ptr [rsp + 4*8 + 96 ]
+  movdqa xmm13, oword ptr [rsp + 4*8 + 112]
+  movdqa xmm14, oword ptr [rsp + 4*8 + 128]
+  movdqa xmm15, oword ptr [rsp + 4*8 + 144]
+  add rsp, 8 + 10*16 + 4*8
+
+  and  r8, 0Fh ; tail = length & 15 bytes still to copy - NOTE(review): no sfence is issued after the vmovntdq stores; confirm callers do not need the ordering
+  imul r8, 5 ; 5 = byte size of one CopyTable entry
+  lea  r9, CopyTable
+  add  r9, r8
+  jmp  r9
+
+  CopyTable: ; entry 0 (tail == 0) is a ret padded to 5 bytes so it is the same size as the jmp entries
+  ret
+  nop
+  nop
+  nop
+  nop
+
+  jmp Copy1
+  jmp Copy2
+  jmp Copy3
+  jmp Copy4
+  jmp Copy5
+  jmp Copy6
+  jmp Copy7
+  jmp Copy8
+  jmp Copy9
+  jmp Copy10
+  jmp Copy11
+  jmp Copy12
+  jmp Copy13
+  jmp Copy14
+
+  ; copy 15 (entry 15 is the code itself: 8 + 4 + 2 + 1 bytes, so it needs no jmp)
+  mov r8  , qword ptr [rdx + 00h]
+  mov r9d , dword ptr [rdx + 08h]
+  mov r10w, word  ptr [rdx + 0Ch]
+  mov al  , byte  ptr [rdx + 0Eh]
+  mov qword ptr [rcx + 00h], r8
+  mov dword ptr [rcx + 08h], r9d
+  mov word  ptr [rcx + 0Ch], r10w
+  mov byte  ptr [rcx + 0Eh], al
+  ret
+
+  ; ensure we generate near jumps (same reason as padding1: keeps every `jmp CopyN` at 5 bytes)
+  padding2 db 127 dup(090h)
+
+  Copy1: ; CopyN copies the final N bytes using the largest available qword/dword/word/byte moves, then returns
+  mov al, byte ptr [rdx]
+  mov byte ptr [rcx], al
+  ret
+
+  Copy2:
+  mov r10w, word ptr [rdx]
+  mov word ptr [rcx], r10w
+  ret
+
+  Copy3:
+  mov r10w, word ptr [rdx]
+  mov word ptr [rcx], r10w
+  mov al, byte ptr [rdx + 02h]
+  mov byte ptr [rcx + 02h], al
+  ret
+
+  Copy4:
+  mov r9d , dword ptr [rdx]
+  mov dword ptr [rcx], r9d
+  ret
+
+  Copy5:
+  mov r9d , dword ptr [rdx]
+  mov dword ptr [rcx], r9d
+  mov al, byte ptr [rdx + 04h]
+  mov byte ptr [rcx + 04h], al
+  ret
+
+  Copy6:
+  mov r9d , dword ptr [rdx]
+  mov dword ptr [rcx], r9d
+  mov r10w, word ptr [rdx + 04h]
+  mov word ptr [rcx + 04h], r10w
+  ret
+
+  Copy7:
+  mov r9d , dword ptr [rdx]
+  mov dword ptr [rcx], r9d
+  mov r10w, word ptr [rdx + 04h]
+  mov word ptr [rcx + 04h], r10w
+  mov al, byte ptr [rdx + 06h]
+  mov byte ptr [rcx + 06h], al
+  ret
+
+  Copy8:
+  mov r8, qword ptr [rdx]
+  mov qword ptr [rcx], r8
+  ret
+
+  Copy9:
+  mov r8, qword ptr [rdx]
+  mov qword ptr [rcx], r8
+  mov al, byte ptr [rdx + 08h]
+  mov byte ptr [rcx + 08h], al
+  ret
+
+  Copy10:
+  mov r8, qword ptr [rdx]
+  mov qword ptr [rcx], r8
+  mov r10w, word ptr [rdx + 08h]
+  mov word ptr [rcx + 08h], r10w
+  ret
+
+  Copy11:
+  mov r8, qword ptr [rdx]
+  mov qword ptr [rcx], r8
+  mov r10w, word ptr [rdx + 08h]
+  mov word ptr [rcx + 08h], r10w
+  mov al, byte ptr [rdx + 0Ah]
+  mov byte ptr [rcx + 0Ah], al
+  ret
+
+  Copy12:
+  mov r8, qword ptr [rdx]
+  mov qword ptr [rcx], r8
+  mov r9d , dword ptr [rdx + 08h]
+  mov dword ptr [rcx + 08h], r9d
+  ret
+
+  Copy13:
+  mov r8, qword ptr [rdx]
+  mov qword ptr [rcx], r8
+  mov r9d , dword ptr [rdx + 08h]
+  mov dword ptr [rcx + 08h], r9d
+  mov al, byte ptr [rdx + 0Ch]
+  mov byte ptr [rcx + 0Ch], al
+  ret
+
+  Copy14:
+  mov r8  , qword ptr [rdx      ]
+  mov r9d , dword ptr [rdx + 08h]
+  mov r10w, word  ptr [rdx + 0Ch]
+  mov qword ptr [rcx      ], r8
+  mov dword ptr [rcx + 08h], r9d
+  mov word  ptr [rcx + 0Ch], r10w
+  ret
+
+memcpySSE endp
+end
\ No newline at end of file
diff --git a/common/memcpySSE2.h b/common/memcpySSE2.h
index 6aae8de5..14d89dd9 100644
--- a/common/memcpySSE2.h
+++ b/common/memcpySSE2.h
@@ -26,6 +26,7 @@ Place, Suite 330, Boston, MA 02111-1307 USA
 
 #include "debug.h"
 
+#if defined(__GNUC__) || defined(__GNUG__) /* GNU-compatible compilers use the inline-asm implementation below */
 #define OP(...) #__VA_ARGS__ "\n\t"
 
 inline static void memcpySSE(void *dst, const void * src, size_t length)
@@ -79,7 +80,7 @@ inline static void memcpySSE(void *dst, const void * src, size_t length)
     OP(add %[rem],%[end])
     OP(jmp *%[end])
 
-	// jump table
+    // jump table
     OP(vmovaps  0x60(%[src]),%%xmm0)
     OP(vmovntdq %%xmm0,0x60(%[dst]))
     OP(vmovaps  0x50(%[src]),%%xmm1)
@@ -95,28 +96,28 @@ inline static void memcpySSE(void *dst, const void * src, size_t length)
     OP(vmovaps  0x00(%[src]),%%xmm6)
     OP(vmovntdq %%xmm6,0x00(%[dst]))
 
-	// alignment as the previous two instructions are only 4 bytes
+    // alignment as the previous two instructions are only 4 bytes
     OP(nop)
     OP(nop)
 
-	// restore the registers
+    // restore the registers
     OP(pop %[end])
     OP(pop %[src])
     OP(pop %[dst])
-	:
-	: [dst]"r" (dst),
-	  [src]"r" (src),
-	  [end]"c" (end),
-	  [rem]"d" (rem)
-	: "xmm0",
-	  "xmm1",
-	  "xmm2",
-	  "xmm3",
-	  "xmm4",
-	  "xmm5",
-	  "xmm6",
-	  "xmm7",
-	  "memory"
+    :
+    : [dst]"r" (dst),
+      [src]"r" (src),
+      [end]"c" (end),
+      [rem]"d" (rem)
+    : "xmm0",
+      "xmm1",
+      "xmm2",
+      "xmm3",
+      "xmm4",
+      "xmm5",
+      "xmm6",
+      "xmm7",
+      "memory"
   );
 
   //copy any remaining bytes
@@ -126,4 +127,7 @@ inline static void memcpySSE(void *dst, const void * src, size_t length)
 #else
   memcpy(dst, src, length);
 #endif
-}
\ No newline at end of file
+}
+#else /* other compilers: memcpySSE is provided externally (see common/memcpySSE.asm) */
+extern "C" void __fastcall memcpySSE(void *dst, const void * src, size_t length);
+#endif /* __GNUC__ || __GNUG__ */
\ No newline at end of file