arm64: lib: Implement optimized memcpy routine
This patch, based on Linaro's Cortex Strings library, improves the
performance of the assembly optimized memcpy() function.

Signed-off-by: Zhichang Yuan <zhichang.yuan@linaro.org>
Signed-off-by: Deepak Saxena <dsaxena@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
commit 808dbac6b5
parent 74d2eb3cdb

1 changed file with 170 additions and 22 deletions
@@ -1,5 +1,13 @@
 /*
  * Copyright (C) 2013 ARM Ltd.
+ * Copyright (C) 2013 Linaro.
+ *
+ * This code is based on glibc cortex strings work originally authored by Linaro
+ * and re-licensed under GPLv2 for the Linux kernel. The original code can
+ * be found @
+ *
+ * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
+ * files/head:/src/aarch64/
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -16,6 +24,7 @@
 
 #include <linux/linkage.h>
 #include <asm/assembler.h>
+#include <asm/cache.h>
 
 /*
  * Copy a buffer from src to dest (alignment handled by the hardware)
@@ -27,27 +36,166 @@
  * Returns:
  *	x0 - dest
  */
+dstin	.req	x0
+src	.req	x1
+count	.req	x2
+tmp1	.req	x3
+tmp1w	.req	w3
+tmp2	.req	x4
+tmp2w	.req	w4
+tmp3	.req	x5
+tmp3w	.req	w5
+dst	.req	x6
+
+A_l	.req	x7
+A_h	.req	x8
+B_l	.req	x9
+B_h	.req	x10
+C_l	.req	x11
+C_h	.req	x12
+D_l	.req	x13
+D_h	.req	x14
+
 ENTRY(memcpy)
-	mov	x4, x0
-	subs	x2, x2, #8
-	b.mi	2f
-1:	ldr	x3, [x1], #8
-	subs	x2, x2, #8
-	str	x3, [x4], #8
-	b.pl	1b
-2:	adds	x2, x2, #4
-	b.mi	3f
-	ldr	w3, [x1], #4
-	sub	x2, x2, #4
-	str	w3, [x4], #4
-3:	adds	x2, x2, #2
-	b.mi	4f
-	ldrh	w3, [x1], #2
-	sub	x2, x2, #2
-	strh	w3, [x4], #2
-4:	adds	x2, x2, #1
-	b.mi	5f
-	ldrb	w3, [x1]
-	strb	w3, [x4]
-5:	ret
+	mov	dst, dstin
+	cmp	count, #16
+	/* When the length is less than 16, the accesses are not aligned. */
+	b.lo	.Ltiny15
+
+	neg	tmp2, src
+	ands	tmp2, tmp2, #15	/* Bytes to reach alignment. */
+	b.eq	.LSrcAligned
+	sub	count, count, tmp2
+	/*
+	* Copy the leading bytes from src to dst in increasing address
+	* order. This eliminates the risk of overwriting the source data
+	* when the distance between src and dst is less than 16. The
+	* memory accesses here are aligned.
+	*/
+	tbz	tmp2, #0, 1f
+	ldrb	tmp1w, [src], #1
+	strb	tmp1w, [dst], #1
+1:
+	tbz	tmp2, #1, 2f
+	ldrh	tmp1w, [src], #2
+	strh	tmp1w, [dst], #2
+2:
+	tbz	tmp2, #2, 3f
+	ldr	tmp1w, [src], #4
+	str	tmp1w, [dst], #4
+3:
+	tbz	tmp2, #3, .LSrcAligned
+	ldr	tmp1, [src],#8
+	str	tmp1, [dst],#8
+
+.LSrcAligned:
+	cmp	count, #64
+	b.ge	.Lcpy_over64
+	/*
+	* Deal with small copies quickly by dropping straight into the
+	* exit block.
+	*/
+.Ltail63:
+	/*
+	* Copy up to 48 bytes of data. At this point we only need the
+	* bottom 6 bits of count to be accurate.
+	*/
+	ands	tmp1, count, #0x30
+	b.eq	.Ltiny15
+	cmp	tmp1w, #0x20
+	b.eq	1f
+	b.lt	2f
+	ldp	A_l, A_h, [src], #16
+	stp	A_l, A_h, [dst], #16
+1:
+	ldp	A_l, A_h, [src], #16
+	stp	A_l, A_h, [dst], #16
+2:
+	ldp	A_l, A_h, [src], #16
+	stp	A_l, A_h, [dst], #16
+.Ltiny15:
+	/*
+	* Prefer to break one ldp/stp into several load/store accesses in
+	* increasing address order, rather than loading/storing 16 bytes
+	* from (src-16) to (dst-16) and winding src back to an aligned
+	* address, as the original Cortex Strings memcpy does. Keeping
+	* that scheme would require memmove to guarantee that the src
+	* address is at least 16 bytes above the dst address, otherwise
+	* some source data would be overwritten when memmove calls memcpy
+	* directly. To keep memmove simple and decouple memcpy from
+	* memmove, the original scheme is dropped here.
+	*/
+	tbz	count, #3, 1f
+	ldr	tmp1, [src], #8
+	str	tmp1, [dst], #8
+1:
+	tbz	count, #2, 2f
+	ldr	tmp1w, [src], #4
+	str	tmp1w, [dst], #4
+2:
+	tbz	count, #1, 3f
+	ldrh	tmp1w, [src], #2
+	strh	tmp1w, [dst], #2
+3:
+	tbz	count, #0, .Lexitfunc
+	ldrb	tmp1w, [src]
+	strb	tmp1w, [dst]
+
+.Lexitfunc:
+	ret
+
+.Lcpy_over64:
+	subs	count, count, #128
+	b.ge	.Lcpy_body_large
+	/*
+	* Less than 128 bytes to copy, so handle 64 here and then jump
+	* to the tail.
+	*/
+	ldp	A_l, A_h, [src],#16
+	stp	A_l, A_h, [dst],#16
+	ldp	B_l, B_h, [src],#16
+	ldp	C_l, C_h, [src],#16
+	stp	B_l, B_h, [dst],#16
+	stp	C_l, C_h, [dst],#16
+	ldp	D_l, D_h, [src],#16
+	stp	D_l, D_h, [dst],#16
+
+	tst	count, #0x3f
+	b.ne	.Ltail63
+	ret
+
+	/*
+	* Critical loop.  Start at a new cache line boundary.  Assuming
+	* 64 bytes per line this ensures the entire loop is in one line.
+	*/
+	.p2align	L1_CACHE_SHIFT
+.Lcpy_body_large:
+	/* Pre-load 64 bytes of data. */
+	ldp	A_l, A_h, [src],#16
+	ldp	B_l, B_h, [src],#16
+	ldp	C_l, C_h, [src],#16
+	ldp	D_l, D_h, [src],#16
+1:
+	/*
+	* Interleave the loads of the next 64-byte block with the stores
+	* of the previously loaded 64 bytes.
+	*/
+	stp	A_l, A_h, [dst],#16
+	ldp	A_l, A_h, [src],#16
+	stp	B_l, B_h, [dst],#16
+	ldp	B_l, B_h, [src],#16
+	stp	C_l, C_h, [dst],#16
+	ldp	C_l, C_h, [src],#16
+	stp	D_l, D_h, [dst],#16
+	ldp	D_l, D_h, [src],#16
+	subs	count, count, #64
+	b.ge	1b
+	stp	A_l, A_h, [dst],#16
+	stp	B_l, B_h, [dst],#16
+	stp	C_l, C_h, [dst],#16
+	stp	D_l, D_h, [dst],#16
+
+	tst	count, #0x3f
+	b.ne	.Ltail63
+	ret
 ENDPROC(memcpy)
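For readers less familiar with AArch64 assembly, here is a rough C-level sketch of the copy strategy the new routine implements: copy the leading bytes forward until src reaches 16-byte alignment, move 64 bytes per iteration in the main loop, then finish the sub-64-byte tail, still front to back. The function name memcpy_sketch and its structure are illustrative assumptions for this note, not part of the patch; the kernel implementation is the assembly above.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical illustration only -- not the kernel implementation. */
void *memcpy_sketch(void *dstin, const void *srcin, size_t count)
{
	unsigned char *dst = dstin;
	const unsigned char *src = srcin;

	if (count >= 16) {
		/* Leading 1/2/4/8-byte copies, in increasing address
		 * order, until src reaches 16-byte alignment. */
		size_t head = -(uintptr_t)src & 15;

		count -= head;
		if (head & 1) { *dst++ = *src++; }
		if (head & 2) { memcpy(dst, src, 2); dst += 2; src += 2; }
		if (head & 4) { memcpy(dst, src, 4); dst += 4; src += 4; }
		if (head & 8) { memcpy(dst, src, 8); dst += 8; src += 8; }

		/* Main loop: 64 bytes per iteration; the assembly does this
		 * with four ldp/stp pairs, loads interleaved with stores. */
		while (count >= 64) {
			memcpy(dst, src, 64);
			dst += 64; src += 64; count -= 64;
		}
	}

	/* Tail of at most 63 bytes: up to three 16-byte chunks, then
	 * 8/4/2/1 bytes, still strictly front to back. */
	while (count >= 16) {
		memcpy(dst, src, 16);
		dst += 16; src += 16; count -= 16;
	}
	if (count & 8) { memcpy(dst, src, 8); dst += 8; src += 8; }
	if (count & 4) { memcpy(dst, src, 4); dst += 4; src += 4; }
	if (count & 2) { memcpy(dst, src, 2); dst += 2; src += 2; }
	if (count & 1) { *dst = *src; }

	return dstin;
}

Copying strictly in increasing address order is the design choice called out in the .Ltiny15 comment: it lets memmove fall through to memcpy whenever dst is below src, without requiring src to be at least 16 bytes above dst.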