looking-glass/common/memcpySSE.asm
2018-05-22 18:59:24 +10:00

237 lines
No EOL
5 KiB
NASM

.code
memcpySSE proc
; dst = rcx
; src = rdx
; len = r8
mov rax, rcx
test r8, r8
jz @Exit
cmp rcx, rdx
je @Exit
sub rsp, 8 + 2*16 + 4*8
movdqa oword ptr [rsp + 4*8 + 00 ], xmm6
movdqa oword ptr [rsp + 4*8 + 16 ], xmm7
; void * end = dst + (length & ~0x7F);
; end = r10
mov r9 , r8
and r9 , 0FFFFFFFFFFFFFF80h
jz @RemainingBlocks
mov r10, rcx
add r10, r9
@FullLoop:
vmovaps xmm0 , xmmword ptr [rdx + 000h]
vmovaps xmm1 , xmmword ptr [rdx + 010h]
vmovaps xmm2 , xmmword ptr [rdx + 020h]
vmovaps xmm3 , xmmword ptr [rdx + 030h]
vmovaps xmm4 , xmmword ptr [rdx + 040h]
vmovaps xmm5 , xmmword ptr [rdx + 050h]
vmovaps xmm6 , xmmword ptr [rdx + 060h]
vmovaps xmm7 , xmmword ptr [rdx + 070h]
vmovntdq xmmword ptr [rcx + 000h], xmm0
vmovntdq xmmword ptr [rcx + 010h], xmm1
vmovntdq xmmword ptr [rcx + 020h], xmm2
vmovntdq xmmword ptr [rcx + 030h], xmm3
vmovntdq xmmword ptr [rcx + 040h], xmm4
vmovntdq xmmword ptr [rcx + 050h], xmm5
vmovntdq xmmword ptr [rcx + 060h], xmm6
vmovntdq xmmword ptr [rcx + 070h], xmm7
add rdx, 080h
add rcx, 080h
cmp rcx, r10
jne @FullLoop
@RemainingBlocks:
; size_t rem = (length & 0x7F) >> 4);
; rem = r11
mov r11, r8
and r11, 07Fh
jz @RestoreExit
shr r11, 4
jz @FinalBytes
mov r10, 7
sub r10, r11
imul r10, 10
lea r9 , @FinalBlocks
add r9 , r10
jmp r9
@RestoreExit:
movdqa xmm6 , oword ptr [rsp + 4*8 + 00]
movdqa xmm7 , oword ptr [rsp + 4*8 + 16]
add rsp, 8 + 2*16 + 4*8
@Exit:
ret
@FinalBlocks:
vmovaps xmm6 , xmmword ptr [rdx + 060h]
vmovntdq xmmword ptr [rcx + 060h], xmm6
vmovaps xmm5 , xmmword ptr [rdx + 050h]
vmovntdq xmmword ptr [rcx + 050h], xmm5
vmovaps xmm4 , xmmword ptr [rdx + 040h]
vmovntdq xmmword ptr [rcx + 040h], xmm4
vmovaps xmm3 , xmmword ptr [rdx + 030h]
vmovntdq xmmword ptr [rcx + 030h], xmm3
vmovaps xmm2 , xmmword ptr [rdx + 020h]
vmovntdq xmmword ptr [rcx + 020h], xmm2
vmovaps xmm1 , xmmword ptr [rdx + 010h]
vmovntdq xmmword ptr [rcx + 010h], xmm1
vmovaps xmm0 , xmmword ptr [rdx + 000h]
vmovntdq xmmword ptr [rcx + 000h], xmm0
movdqa xmm6 , oword ptr [rsp + 4*8 + 00]
movdqa xmm7 , oword ptr [rsp + 4*8 + 16]
add rsp, 8 + 2*16 + 4*8
sfence
shl r11, 4
add rdx, r11
add rcx, r11
@FinalBytes:
and r8, 0Fh
jz @Exit
imul r8, 5
lea r9, @FinalBytesTable
add r9, r8
jmp r9
@FinalBytesTable:
jmp @Copy1
jmp @Copy2
jmp @Copy3
jmp @Copy4
jmp @Copy5
jmp @Copy6
jmp @Copy7
jmp @Copy8
jmp @Copy9
jmp @Copy10
jmp @Copy11
jmp @Copy12
jmp @Copy13
jmp @Copy14
jmp @Copy15
db 128 DUP(0CCh)
; fall through - 1 byte
@Copy1:
mov al, byte ptr [rdx]
mov byte ptr [rcx], al
ret
@Copy2:
mov r10w, word ptr [rdx]
mov word ptr [rcx], r10w
ret
@Copy3:
mov r10w, word ptr [rdx]
mov word ptr [rcx], r10w
mov r11b, byte ptr [rdx + 02h]
mov byte ptr [rcx + 02h], r11b
ret
@Copy4:
mov r9d, dword ptr [rdx]
mov dword ptr [rcx], r9d
ret
@Copy5:
mov r9d, dword ptr [rdx ]
mov r11b , byte ptr [rdx + 04h]
mov dword ptr [rcx ], r9d
mov byte ptr [rcx + 04h], r11b
ret
@Copy6:
mov r9d , dword ptr [rdx ]
mov r10w, word ptr [rdx + 04h]
mov dword ptr [rcx ], r9d
mov word ptr [rcx + 04h], r10w
ret
@Copy7:
mov r9d , dword ptr [rdx ]
mov r10w, word ptr [rdx + 04h]
mov r11b, byte ptr [rdx + 06h]
mov dword ptr [rcx ], r9d
mov word ptr [rcx + 04h], r10w
mov byte ptr [rcx + 06h], r11b
ret
@Copy8:
mov r8, qword ptr [rdx]
mov qword ptr [rcx], r8
ret
@Copy9:
mov r8 , qword ptr [rdx ]
mov r11b, byte ptr [rdx + 08h]
mov qword ptr [rcx ], r8
mov byte ptr [rcx + 08h], r11b
ret
@Copy10:
mov r8 , qword ptr [rdx ]
mov r10w, word ptr [rdx + 08h]
mov qword ptr [rcx ], r8
mov word ptr [rcx + 08h], r10w
ret
@Copy11:
mov r8 , qword ptr [rdx ]
mov r10w, word ptr [rdx + 08h]
mov r11b, byte ptr [rdx + 0Ah]
mov qword ptr [rcx ], r8
mov word ptr [rcx + 08h], r10w
mov byte ptr [rcx + 0Ah], r11b
ret
@Copy12:
mov r8 , qword ptr [rdx ]
mov r9d, dword ptr [rdx + 08h]
mov qword ptr [rcx ], r8
mov dword ptr [rcx + 08h], r9d
ret
@Copy13:
mov r8 , qword ptr [rdx ]
mov r9d , dword ptr [rdx + 08h]
mov r11b, byte ptr [rdx + 0Ch]
mov qword ptr [rcx ], r8
mov dword ptr [rcx + 08h], r9d
mov byte ptr [rcx + 0Ch], r11b
ret
@Copy14:
mov r8 , qword ptr [rdx ]
mov r9d , dword ptr [rdx + 08h]
mov r10w, word ptr [rdx + 0Ch]
mov qword ptr [rcx ], r8
mov dword ptr [rcx + 08h], r9d
mov word ptr [rcx + 0Ch], r10w
ret
; copy 15
@Copy15:
mov r8 , qword ptr [rdx + 00h]
mov r9d , dword ptr [rdx + 08h]
mov r10w, word ptr [rdx + 0Ch]
mov r11b, byte ptr [rdx + 0Eh]
mov qword ptr [rcx + 00h], r8
mov dword ptr [rcx + 08h], r9d
mov word ptr [rcx + 0Ch], r10w
mov byte ptr [rcx + 0Eh], r11b
ret
memcpySSE endp
end