Optimised version of memset for the SH4 which uses movca.l. Signed-off-by: Stuart Menefy <stuart.menefy@st.com> Signed-off-by: Paul Mundt <lethal@linux-sh.org>
		
			
				
	
	
		
			107 lines
		
	
	
	
		
			1.6 KiB
			
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
			
		
		
	
	
			107 lines
		
	
	
	
		
			1.6 KiB
			
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
/*
 * "memset" implementation for SH4
 *
 * Copyright (C) 1999  Niibe Yutaka
 * Copyright (c) 2009  STMicroelectronics Limited
 * Author: Stuart Menefy <stuart.menefy@st.com>
 */
/*
 *            void *memset(void *s, int c, size_t n);
 *
 * Fills the n bytes starting at s with the low byte of c and returns s.
 * The buffer is filled backwards, from s + n down to s.
 *
 * Register usage:
 *   r4      in: s.  Set to s + n on entry, then walked down with
 *           pre-decrement stores until it is back at s.
 *   r5      in: c.  Widened to the fill byte replicated in all four
 *           bytes before any word store is done.
 *   r6      in: n.  Number of bytes still to be written.
 *   r0      scratch and loop counter; holds the return value on exit.
 *   r1, r3  scratch while aligning the end pointer to 32 bytes.
 *   r2      iteration counter of the 32-byte block loop.
 * The T bit is modified as well.
 *
 * Three paths are used depending on the length:
 *   n < 12          byte stores only (label 40)
 *   n < 64          two word stores per iteration (labels 22 and 3)
 *   otherwise       32-byte blocks, first word of each block written
 *                   with movca.l (label 12), then the paths above for
 *                   whatever is left
 *
 * Instructions indented by one extra space sit in the delay slot of the
 * branch above them and are executed whether or not the branch is taken.
 */

#include <linux/linkage.h>

ENTRY(memset)
	mov	#12,r0
	add	r6,r4		! r4 = s + n, one past the last byte
	cmp/gt	r6,r0		! 12 > n ?
	bt/s	40f		! if it's too small, set a byte at once
	 mov	r4,r0
	and	#3,r0		! r0 = bytes above the last 4-byte boundary
	cmp/eq	#0,r0
	bt/s	2f		! It's aligned
	 sub	r0,r6		! r6 -= r0 (no change when already aligned)
1:				! byte stores until r4 is 4-byte aligned
	dt	r0
	bf/s	1b
	 mov.b	r5,@-r4
2:				! make VVVV
	extu.b	r5,r5
	swap.b	r5,r0		!   V0
	or	r0,r5		!   VV
	swap.w	r5,r0		! VV00
	or	r0,r5		! VVVV

	! Check if enough bytes need to be set to be worth the big loop
	mov	#0x40, r0	! (MT)
	cmp/gt	r6,r0		! (MT)  64 > len => slow loop

	bt/s	22f
	 mov	r6,r0		! r0 = remaining length, as label 22 expects

	! align the dst to the cache block size if necessary
	mov	r4, r3
	mov	#~(0x1f), r1

	and	r3, r1		! r1 = r4 rounded down to a 32-byte boundary
	cmp/eq	r3, r1

	bt/s	11f		! dst is already aligned
	 sub	r1, r3		! r3-r1 -> r3
	shlr2	r3		! number of loops

10:	mov.l	r5,@-r4		! word stores down to the 32-byte boundary
	dt	r3
	bf/s	10b
	 add	#-4, r6		! runs once per word stored

11:	! dst is 32byte aligned
	mov	r6,r2
	mov	#-5,r0
	shld	r0,r2		! number of loops

	add	#-32, r4	! r4 = base of the first block to fill
	mov	r5, r0		! movca.l below stores from r0
12:				! one 32-byte block per iteration
	movca.l	r0,@r4
	mov.l	r5,@(4, r4)
	mov.l	r5,@(8, r4)
	mov.l	r5,@(12,r4)
	mov.l	r5,@(16,r4)
	mov.l	r5,@(20,r4)
	add	#-0x20, r6
	mov.l	r5,@(24,r4)
	dt	r2		! T is still valid at bf/s: mov.l leaves it alone
	mov.l	r5,@(28,r4)
	bf/s	12b
	 add	#-32, r4

	add	#32, r4		! undo the extra -32 from the final delay slot
	mov	#8, r0
	cmp/ge	r0, r6		! r6 >= 8 ?
	bf	40f		! fewer than 8 bytes left: finish with bytes

	mov	r6,r0
22:				! entered with r0 = r6 = remaining length
	shlr2	r0
	shlr	r0		! r0 = r6 >> 3
3:
	dt	r0
	mov.l	r5,@-r4		! set 8-byte at once
	bf/s	3b
	 mov.l	r5,@-r4
	!
	mov	#7,r0
	and	r0,r6		! r6 = bytes left over after the 8-byte stores

	! fill bytes (length may be zero)
40:	tst	r6,r6
	bt	5f
4:
	dt	r6
	bf/s	4b
	 mov.b	r5,@-r4
5:
	rts
	 mov	r4,r0		! return s: r4 has been walked back to the start