280adc1951

This patch, based on Linaro's Cortex Strings library, improves the
performance of the assembly optimized memmove() function.

Signed-off-by: Zhichang Yuan <zhichang.yuan@linaro.org>
Signed-off-by: Deepak Saxena <dsaxena@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>

/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by Linaro
 * and re-licensed under GPLv2 for the Linux kernel. The original code can
 * be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

/*
 * Move a buffer from src to dest (alignment handled by the hardware).
 * If dest <= src, call memcpy, otherwise copy in reverse order.
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
tmp3	.req	x5
tmp3w	.req	w5
dst	.req	x6

A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

ENTRY(memmove)
	cmp	dstin, src
	b.lo	memcpy
	add	tmp1, src, count
	cmp	dstin, tmp1
	b.hs	memcpy		/* No overlap.  */

	add	dst, dstin, count
	add	src, src, count
	cmp	count, #16
	b.lo	.Ltail15  /* probably unaligned accesses */

	ands	tmp2, src, #15     /* Bytes to reach alignment.  */
	b.eq	.LSrcAligned
	sub	count, count, tmp2
	/*
	* Copy enough bytes from the end of the buffer first to make src
	* 16-byte aligned. The cost of these extra instructions is
	* acceptable, and it ensures the subsequent accesses are aligned.
	*/
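	/*
	* Each set low bit of tmp2 (bits 0-3) selects a 1, 2, 4 or 8 byte
	* step below; negative pre-indexed addressing is used because the
	* copy runs backwards from the end of the buffers.
	*/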
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src, #-1]!
	strb	tmp1w, [dst, #-1]!
1:
	tbz	tmp2, #1, 2f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
2:
	tbz	tmp2, #2, 3f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!

.LSrcAligned:
	cmp	count, #64
	b.ge	.Lcpy_over64

	/*
	* Deal with small copies quickly by dropping straight into the
	* exit block.
	*/
.Ltail63:
	/*
	* Copy up to 48 bytes of data. At this point we only need the
	* bottom 6 bits of count to be accurate.
	*/
	ands	tmp1, count, #0x30
	b.eq	.Ltail15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
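	/*
	* Fall-through structure: 0x30 copies 48 bytes via all three pairs
	* below, 0x20 branches to 1: (32 bytes), 0x10 branches to 2:
	* (16 bytes).
	*/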
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
1:
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
2:
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!

.Ltail15:
	tbz	count, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	count, #2, 2f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
2:
	tbz	count, #1, 3f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
3:
	tbz	count, #0, .Lexitfunc
	ldrb	tmp1w, [src, #-1]
	strb	tmp1w, [dst, #-1]

.Lexitfunc:
	ret

.Lcpy_over64:
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	* Less than 128 bytes to copy, so handle 64 bytes here and then jump
	* to the tail.
	*/
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	D_l, D_h, [src, #-64]!
	stp	D_l, D_h, [dst, #-64]!
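	/*
	* Only the last pair uses writeback, so src and dst are now
	* adjusted down by 64 bytes, ready for the .Ltail63 tail copy.
	*/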

	tst	count, #0x3f
	b.ne	.Ltail63
	ret

	/*
	* Critical loop. Start at a new cache line boundary. Assuming
	* 64 bytes per line this ensures the entire loop is in one line.
	*/
	.p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
	/* pre-load 64 bytes of data. */
	ldp	A_l, A_h, [src, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	ldp	D_l, D_h, [src, #-64]!
1:
	/*
	* Interleave the load of the next 64-byte block with the store of
	* the previously loaded 64 bytes.
	*/
	stp	A_l, A_h, [dst, #-16]
	ldp	A_l, A_h, [src, #-16]
	stp	B_l, B_h, [dst, #-32]
	ldp	B_l, B_h, [src, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	C_l, C_h, [src, #-48]
	stp	D_l, D_h, [dst, #-64]!
	ldp	D_l, D_h, [src, #-64]!
	subs	count, count, #64
	b.ge	1b
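	/* Drain the 64 bytes loaded by the final loop iteration. */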
	stp	A_l, A_h, [dst, #-16]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	stp	D_l, D_h, [dst, #-64]!

	tst	count, #0x3f
	b.ne	.Ltail63
	ret
ENDPROC(memmove)
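
For reference, the dispatch described in the file's header comment (fall back to
memcpy when dest is below src or when the buffers do not overlap, otherwise copy
in reverse order) can be sketched at the C level roughly as below. This is only
an illustration of the control flow; the helper name memmove_sketch is made up
for this sketch and is not part of the kernel sources.

#include <stddef.h>
#include <string.h>

static void *memmove_sketch(void *dest, const void *src, size_t n)
{
	unsigned char *d = dest;
	const unsigned char *s = src;

	/* A forward copy is safe: dest below src, or no overlap at all. */
	if (d < s || d >= s + n)
		return memcpy(dest, src, n);

	/* Overlapping with dest above src: copy backwards from the end. */
	while (n--)
		d[n] = s[n];

	return dest;
}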