commit d50ba3687b (223 lines, 3.7 KiB)
As suggested by Peter Anvin.
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: H. Peter Anvin <hpa@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
/*
 * Normally compiler builtins are used, but sometimes the compiler calls out
 * of line code. Based on asm-i386/string.h.
 *
 * This assembly file is re-written from memmove_64.c file.
 *	- Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
 */
#define _STRING_C
#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>

#undef memmove

/*
 * Implement memmove(). This can handle overlap between src and dst.
 *
 * Input:
 * rdi: dest
 * rsi: src
 * rdx: count
 *
 * Output:
 * rax: dest
 */
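/*
 * For reference (illustrative sketch, not part of the original file): in C
 * terms this routine implements
 *
 *	void *memmove(void *dest, const void *src, size_t count);
 *
 * It returns dest in %rax and, unlike memcpy(), copies correctly even when
 * the source and destination regions overlap.
 */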
ENTRY(memmove)
	CFI_STARTPROC

	/* Handle 32 bytes or more in the loops below */
	mov %rdi, %rax
	cmp $0x20, %rdx
	jb	1f

	/* Decide forward/backward copy mode */
	cmp %rdi, %rsi
	jge .Lmemmove_begin_forward
	mov %rsi, %r8
	add %rdx, %r8
	cmp %rdi, %r8
	jg 2f

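	/*
	 * Falling through to .Lmemmove_begin_forward means either
	 * src >= dest, where a forward copy never overwrites source bytes
	 * that have not been read yet, or src + count <= dest, where the
	 * regions do not overlap at all.  Only src < dest < src + count
	 * needs the backward copy starting at label 2.
	 */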
.Lmemmove_begin_forward:
	/*
	 * The movsq instruction has a high startup latency, so small
	 * copies are handled with general-purpose registers instead.
	 */
	cmp  $680, %rdx
	jb	3f
	/*
	 * movsq only pays off when source and destination are equally
	 * aligned.
	 */

	cmpb %dil, %sil
	je 4f
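	/*
	 * The cmpb above compares only the low bytes of dest and src: the
	 * rep-movsq path at 4 is taken only when the two pointers differ by
	 * a multiple of 256, so the quadword copy stays mutually aligned.
	 */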
3:
	sub $0x20, %rdx
	/*
	 * We gobble 32 bytes forward in each loop.
	 */
5:
	sub $0x20, %rdx
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r10
	movq 2*8(%rsi), %r9
	movq 3*8(%rsi), %r8
	leaq 4*8(%rsi), %rsi

	movq %r11, 0*8(%rdi)
	movq %r10, 1*8(%rdi)
	movq %r9, 2*8(%rdi)
	movq %r8, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae 5b
	addq $0x20, %rdx
	jmp 1f
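	/*
	 * The extra sub at 3 pre-biases the count and the addq above undoes
	 * it: each pass through 5 subtracts 32 and copies 32 bytes, and jae
	 * keeps looping as long as the subtraction did not borrow, so the
	 * loop exits with the true remainder (< 32 bytes) left in %rdx for
	 * the shared tail code at 1.
	 */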
	/*
	 * Handle data forward by movsq.
	 */
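	/*
	 * %rcx is set to count / 8 quadwords for rep movsq.  The trailing
	 * 0..7 odd bytes are covered by loading the last eight source bytes
	 * into %r11 before the rep and storing them to the last eight
	 * destination bytes afterwards; loading first keeps this correct
	 * when the regions overlap.
	 */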
	.p2align 4
4:
	movq %rdx, %rcx
	movq -8(%rsi, %rdx), %r11
	lea -8(%rdi, %rdx), %r10
	shrq $3, %rcx
	rep movsq
	movq %r11, (%r10)
	jmp 13f
.Lmemmove_end_forward:

	/*
	 * Handle data backward by movsq.
	 */
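	/*
	 * Mirror of the forward movsq path: the first eight source bytes
	 * are saved in %r11 before the copy and stored to the head of the
	 * destination afterwards, and rep movsq runs from the ends of both
	 * buffers with the direction flag set (std), which cld then clears.
	 */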
	.p2align 4
7:
	movq %rdx, %rcx
	movq (%rsi), %r11
	movq %rdi, %r10
	leaq -8(%rsi, %rdx), %rsi
	leaq -8(%rdi, %rdx), %rdi
	shrq $3, %rcx
	std
	rep movsq
	cld
	movq %r11, (%r10)
	jmp 13f

	/*
	 * Prepare for the backward copy.
	 */
	.p2align 4
2:
	cmp $680, %rdx
	jb 6f
	cmp %dil, %sil
	je 7b
6:
	/*
	 * Point %rsi/%rdi at the tails of the buffers.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * We gobble 32 bytes backward in each loop.
	 */
8:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r11
	movq -2*8(%rsi), %r10
	movq -3*8(%rsi), %r9
	movq -4*8(%rsi), %r8
	leaq -4*8(%rsi), %rsi

	movq %r11, -1*8(%rdi)
	movq %r10, -2*8(%rdi)
	movq %r9, -3*8(%rdi)
	movq %r8, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae 8b
	/*
	 * Restore the remainder and point %rsi/%rdi back at the heads of
	 * the buffers.
	 */
	addq $0x20, %rdx
	subq %rdx, %rsi
	subq %rdx, %rdi
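	/*
	 * Same pre-bias trick as the forward loop, mirrored: each pass
	 * through 8 subtracts 32 from the pre-biased count and copies 32
	 * bytes working downwards; when the subtraction borrows, the
	 * remaining (< 32) bytes at the front of the buffers are left for
	 * the shared tail code at 1.
	 */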
1:
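	/*
	 * Each tail case below loads the whole remaining chunk into
	 * registers before storing any of it, using overlapping head/tail
	 * accesses where needed, so the copy stays correct for any overlap
	 * of src and dest.
	 */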
	cmpq $16, %rdx
	jb 9f
	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r10
	movq -2*8(%rsi, %rdx), %r9
	movq -1*8(%rsi, %rdx), %r8
	movq %r11, 0*8(%rdi)
	movq %r10, 1*8(%rdi)
	movq %r9, -2*8(%rdi, %rdx)
	movq %r8, -1*8(%rdi, %rdx)
	jmp 13f
	.p2align 4
9:
	cmpq $8, %rdx
	jb 10f
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi), %r11
	movq -1*8(%rsi, %rdx), %r10
	movq %r11, 0*8(%rdi)
	movq %r10, -1*8(%rdi, %rdx)
	jmp 13f
10:
	cmpq $4, %rdx
	jb 11f
	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %r11d
	movl -4(%rsi, %rdx), %r10d
	movl %r11d, (%rdi)
	movl %r10d, -4(%rdi, %rdx)
	jmp 13f
11:
	cmp $2, %rdx
	jb 12f
	/*
	 * Move data from 2 bytes to 3 bytes.
	 */
	movw (%rsi), %r11w
	movw -2(%rsi, %rdx), %r10w
	movw %r11w, (%rdi)
	movw %r10w, -2(%rdi, %rdx)
	jmp 13f
12:
	cmp $1, %rdx
	jb 13f
	/*
	 * Move data for 1 byte.
	 */
	movb (%rsi), %r11b
	movb %r11b, (%rdi)
13:
	retq
	CFI_ENDPROC

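	/*
	 * On CPUs advertising X86_FEATURE_ERMS (enhanced rep movsb), the
	 * alternatives mechanism patches the forward-copy body between
	 * .Lmemmove_begin_forward and .Lmemmove_end_forward with the short
	 * movq/rep movsb/retq replacement below.
	 */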
	.section .altinstr_replacement,"ax"
.Lmemmove_begin_forward_efs:
	/* Forward moving data. */
	movq %rdx, %rcx
	rep movsb
	retq
.Lmemmove_end_forward_efs:
	.previous

	.section .altinstructions,"a"
	altinstruction_entry .Lmemmove_begin_forward,		\
		.Lmemmove_begin_forward_efs,X86_FEATURE_ERMS,	\
		.Lmemmove_end_forward-.Lmemmove_begin_forward,	\
		.Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
	.previous
ENDPROC(memmove)